engine/modules/regex.cpp

/*
 * Copyright 2003. Vladimir Prus
 * Distributed under the Boost Software License, Version 1.0.
 * (See accompanying file LICENSE_1_0.txt or copy at
 * http://www.boost.org/LICENSE_1_0.txt)
 */

#include "../mem.h"
#include "../native.h"
#include "../jam_strings.h"
#include "../subst.h"

/*
rule split ( string separator )
{
    local result ;
    local s = $(string) ;

    local match = 1 ;
    while $(match)
    {
        match = [ MATCH ^(.*)($(separator))(.*) : $(s) ] ;
        if $(match)
        {
            match += "" ;  # in case 3rd item was empty - works around MATCH bug
            result = $(match[3]) $(result) ;
            s = $(match[1]) ;
        }
    }
    return $(s) $(result) ;
}
*/

/*
 * regex_split() - native implementation of the 'regex.split' rule.
 *
 * Reads two items from the first argument list of 'frame': the string to split
 * and the separator regular expression. Returns a new list holding the pieces
 * of the string found between successive separator matches, followed by the
 * remainder after the last match. 'flags' is not used.
 */
LIST * regex_split( FRAME * frame, int flags )
{
    LIST * args = lol_get( frame->args, 0 );
    OBJECT * s;
    OBJECT * separator;
    regexp * re;
    const char * pos, * prev;
    LIST * result = L0;
    LISTITER iter = list_begin( args );
    s = list_item( iter );
    separator = list_item( list_next( iter ) );

    re = regex_compile( separator );

    /* 'pos' is where the next search starts. 'prev' is the end of the last
     * separator found, i.e. the start of the piece being collected.
     */
    prev = pos = object_str( s );
    while ( regexec( re, pos ) )
    {
        /* Emit everything between the previous separator and this one. */
        result = list_push_back( result, object_new_range( prev, re->startp[ 0 ] - prev ) );
        prev = re->endp[ 0 ];
        /* Handle empty matches */
        if ( *pos == '\0' )
            break;
        else if ( pos == re->endp[ 0 ] )
            /* Empty match right at 'pos': step over one character so the
             * search makes progress. 'prev' deliberately stays behind so that
             * the stepped-over character remains part of the next piece.
             */
            pos++;
        else
            pos = re->endp[ 0 ];
    }

    /* The trailing piece starts at 'prev', not 'pos': after an empty match at
     * 'pos' the search position is one character ahead of 'prev', and using
     * 'pos' would drop that character when no further match is found. On every
     * other way out of the loop 'prev' and 'pos' are equal.
     */
    result = list_push_back( result, object_new( prev ) );

    return result;
}

/*
rule replace (
    string  # The string to modify.
    match  # The characters to replace.
    replacement  # The string to replace with.
    )
{
    local result = "" ;
    local parts = 1 ;
    while $(parts)
    {
        parts = [ MATCH ^(.*)($(match))(.*) : $(string) ] ;
        if $(parts)
        {
            parts += "" ;
            result = "$(replacement)$(parts[3])$(result)" ;
            string = $(parts[1]) ;
        }
    }
    string ?= "" ;
    result = "$(string)$(result)" ;
    return $(result) ;
}
*/

/*
 * regex_replace() - native implementation of the 'regex.replace' rule.
 *
 * Reads three items from the first argument list of 'frame': the subject
 * string, the regular expression to look for and the replacement text. Returns
 * a one-element list holding the subject with every match substituted by the
 * replacement. 'flags' is not used.
 */
LIST * regex_replace( FRAME * frame, int flags )
{
    LIST * const args = lol_get( frame->args, 0 );
    LISTITER const first = list_begin( args );
    LISTITER const second = list_next( first );
    LISTITER const third = list_next( second );
    OBJECT * const subject = list_item( first );
    OBJECT * const pattern = list_item( second );
    OBJECT * const replacement = list_item( third );
    regexp * const re = regex_compile( pattern );
    const char * cursor = object_str( subject );
    string out[ 1 ];
    LIST * result;

    string_new( out );

    while ( regexec( re, cursor ) )
    {
        const char * const match_end = re->endp[ 0 ];

        /* Copy the unmatched text before the match, then the replacement. */
        string_append_range( out, cursor, re->startp[ 0 ] );
        string_append( out, object_str( replacement ) );

        /* A match found while already at the terminator cannot be followed by
         * another search.
         */
        if ( !*cursor )
            break;

        if ( cursor != match_end )
        {
            cursor = match_end;
        }
        else
        {
            /* Empty match right at the cursor: copy one character verbatim and
             * step past it so the search makes progress.
             */
            string_push_back( out, *cursor );
            ++cursor;
        }
    }

    /* Whatever is left after the last match is copied unchanged. */
    string_append( out, cursor );

    result = list_new( object_new( out->value ) );
    string_free( out );
    return result;
}

/*
rule transform ( list * : pattern : indices * )
{
    indices ?= 1 ;
    local result ;
    for local e in $(list)
    {
        local m = [ MATCH $(pattern) : $(e) ] ;
        if $(m)
        {
            result += $(m[$(indices)]) ;
        }
    }
    return $(result) ;
}
*/

/*
 * regex_transform() - native implementation of the 'regex.transform' rule.
 *
 * Argument lists read from 'frame': 0 - the elements to match, 1 - the pattern
 * (its first item is used), 2 - optional sub-expression indices. When no
 * indices are given, the single index 1 is used. For every element that
 * matches the pattern, each requested non-empty sub-match is appended to the
 * returned list. 'flags' is not used.
 *
 * NOTE(review): index values come straight from atoi() and are not
 * range-checked here before being used to index startp/endp - confirm that
 * callers only pass valid sub-expression numbers.
 */
LIST * regex_transform( FRAME * frame, int flags )
{
    LIST * const elements = lol_get( frame->args, 0 );
    LIST * const pattern = lol_get( frame->args, 1 );
    LIST * const indices_list = lol_get( frame->args, 2 );
    LIST * result = L0;
    int * indices;
    int count;
    regexp * re;
    LISTITER it;
    LISTITER last;
    string piece[ 1 ];

    /* Build the array of requested sub-expression indices. */
    if ( list_empty( indices_list ) )
    {
        count = 1;
        indices = (int *)BJAM_MALLOC( sizeof( int ) );
        indices[ 0 ] = 1;
    }
    else
    {
        int k = 0;
        count = list_length( indices_list );
        indices = (int *)BJAM_MALLOC( count * sizeof( int ) );
        last = list_end( indices_list );
        for ( it = list_begin( indices_list ); it != last; it = list_next( it ) )
            indices[ k++ ] = atoi( object_str( list_item( it ) ) );
    }

    /* Result is cached and intentionally never freed */
    re = regex_compile( list_front( pattern ) );

    string_new( piece );

    last = list_end( elements );
    for ( it = list_begin( elements ); it != last; it = list_next( it ) )
    {
        int k;

        if ( !regexec( re, object_str( list_item( it ) ) ) )
            continue;

        for ( k = 0; k < count; ++k )
        {
            int const group = indices[ k ];

            /* Skip empty submatches. Not sure it is right in all cases, but
             * surely is right for the case for which this routine is
             * optimized -- header scanning.
             */
            if ( re->startp[ group ] == re->endp[ group ] )
                continue;

            /* 'piece' is a scratch buffer reused for every sub-match. */
            string_append_range( piece, re->startp[ group ], re->endp[ group ] );
            result = list_push_back( result, object_new( piece->value ) );
            string_truncate( piece, 0 );
        }
    }

    string_free( piece );
    BJAM_FREE( indices );
    return result;
}


/*
 * init_regex() - registers the native rules of the 'regex' module.
 *
 * Each rule is declared with its module name, rule name, a null-terminated
 * argument signature and the implementing function. The trailing integer is
 * passed through as given (presumably an interface version number - confirm
 * against declare_native_rule).
 */
void init_regex()
{
    char const * split_args[] = { "string", "separator", 0  };
    char const * replace_args[] = { "string", "match", "replacement", 0  };
    char const * transform_args[] =
        { "list", "*", ":", "pattern", ":", "indices", "*", 0 };

    declare_native_rule( "regex", "split", split_args, regex_split, 1 );
    declare_native_rule( "regex", "replace", replace_args, regex_replace, 1 );
    declare_native_rule( "regex", "transform", transform_args, regex_transform, 2 );
}