Tokenizes a word, a subword or a list into a list. Where possible, the resulting words are created as subwords of the input. Backslashes and bars in words are preserved (i.e. words are not purified). Comments and line continuations are ignored.
If the input is a list then all its elements are tokenized one-by-one.
The return value is the tokenized list. If there is an error, returns an error atom whose error code is ERROR_INCOMPLETE_PAIR, whose error position is the position in the word (0-based) and whose error source is the word itself.
{
if( IS_LIST(input) )
{
if( method==TOKENIZE_DATA &&
GET_FLAGS( input, FLAG_TOKENIZED_DATA|FLAG_TOKENIZED_COMMANDS ) )
return USE(input);
if( method==TOKENIZE_COMMANDS &&
GET_FLAGS( input, FLAG_TOKENIZED_COMMANDS ) )
return USE(input);
}
if( IS_FLOAT(input) )
{
return USE(input);
}
#ifdef SAFEMODE
assert( IS_WORD(input) || IS_SUBWORD(input) || IS_LIST(input) );
#endif
if( IS_LIST(input) )
{
atom_t result = empty_list;
atom_t last = empty_list;
atom_t x;
atom_t y;
for( x=input; IS_NOT_EMPTY(x); x=CDR(x) )
{
int submethod = IS_LIST(CAR(x))?TOKENIZE_DATA:method;
atom_t element = tokenize( CAR(x), submethod );
if( IS_ERROR(element) )
{
DEUSE( result );
result = element;
break;
}
int initial_flags = GET_FLAGS( x, FLAG_NEWLINE|FLAG_AFTER_SPACE );
int final_flags = GET_FLAGS( x, FLAG_BEFORE_SPACE );
if( IS_FLOAT(element) )
{
append( USE(element), &result, &last );
SET_FLAGS( last, initial_flags|final_flags );
}
else if( IS_LIST(CAR(x)) )
{
if( IS_EXTENDED(x) )
{
append_ex( USE(element), &result, &last );
DEUSE( POS(last) );
POS( last ) = USE( POS(x) );
}
else
append( USE(element), &result, &last );
SET_FLAGS( last, initial_flags|final_flags );
}
else
{
for( y=element; IS_NOT_EMPTY(y); y=CDR(y) )
{
if( IS_EXTENDED(y) )
{
append_ex( USE(CAR(y)), &result, &last );
DEUSE( POS(last) );
POS( last ) = USE( POS(y) );
}
else
append( USE(CAR(y)), &result, &last );
SET_FLAGS( last, FLAGS(y) );
if( y==element ) SET_FLAGS( last, initial_flags );
}
#ifdef SAFE_MODE
assert( IS_NOT_EMPTY(last) );
#endif
SET_FLAGS( last, final_flags);
}
DEUSE( element );
}
return result;
}
chars_t source = STRING(input);
int len = LENGTH(input);
int origlen = len;
chars_t buffer = ALLOC( CHAR_SIZE*len );
char_t ch;
int_t errpos = -1;
int last_token = TOKEN_LINEEND;
// Scans the next token from the shared input (source/len of the enclosing
// function) using a table-driven state machine.
//
// token  - receives the word atom when a word token is returned
// method - tokenization method; selects which characters are special
//
// Returns TOKEN_END when no input is left on entry, TOKEN_WORD or
// TOKEN_DIRTY_WORD (a bar or backslash was seen) together with *token,
// TOKEN_ERROR when a table entry carries PAT_ERROR, otherwise the token kind
// encoded in the table entry.
int get_token( atom_t *token, int method )
{
    // Tells whether the text collected in buffer[0..bp) looks like the
    // mantissa and exponent marker of a number (digits, optional '.' with
    // digits, then 'E' or 'e'), so that a following '+' or '-' belongs to the
    // number instead of being an operator. The text is scanned backwards.
    int is_number(chars_t bp)
    {
        chars_t cp = bp;
        int num_mode = 1;
        char_t ch;

        if( bp==buffer ) return 0;
        while( cp>buffer )
        {
            cp--;
            ch = *cp;
            switch( num_mode )
            {
            case 1: // the last character must be the exponent marker
                if( ch!=TEXT('E') && ch!=TEXT('e') ) return 0;
                num_mode = 2;
                break;
            case 2: // a digit must directly precede the marker
                if( ch<TEXT('0') || ch>TEXT('9') ) return 0;
                num_mode = 3;
                break;
            case 3: // further digits, or the decimal point
                if( ch==TEXT('.') ) { num_mode = 4; break; }
                if( ch<TEXT('0') || ch>TEXT('9') ) return 0;
                break;
            case 4: // digits of the integer part
                if( ch<TEXT('0') || ch>TEXT('9') ) return 0;
                break;
            }
        }
        // A lone exponent marker without any digit is not a number.
        return num_mode>=3;
    }

    if( !len ) return TOKEN_END;

    int dirty = 0;      // set by PAT_DIRTY; result becomes TOKEN_DIRTY_WORD
    int mutated = 0;    // set by PAT_MUTATED; token is built from buffer
    chars_t bp = buffer; *bp=NULL_CHAR;   // write position in buffer
    chars_t sp = source;                  // start of the token in the input
    int mode = MODE_ENTRY;
    int code;
    int stack[MAX_MODE];  // stack[m] is the mode to return to from mode m

    // Action tables, one per character class, indexed by the current mode.
    // Judging by the entries the row order appears to be ENTRY, SPACE, WORD,
    // BARRED, BACKSLASHED, TILDE, SEMITILDE, SEMICOLON, TILDESPACE, LESS,
    // GREATER - TODO confirm against the MODE_* definitions.
    static int mode_eof[MAX_MODE] =
    {
        PAT_TOKEN_END,
        PAT_TOKEN_SPACE,
        PAT_TOKEN_WORD,
        PAT_ERROR,
        PAT_ERROR,
        PAT_ERROR,
        PAT_ERROR,
        PAT_RETURN,
        PAT_ERROR,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_eol[MAX_MODE] =
    {
        PAT_NEXT+PAT_TOKEN_LINE,
        PAT_TOKEN_SPACE,
        PAT_TOKEN_WORD,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_NEXT+PAT_RETURN,
        PAT_NEXT+PAT_RETURN,
        PAT_RETURN,
        PAT_NEXT+PAT_RETURN,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_space[MAX_MODE] =
    {
        PAT_NEXT+PAT_GOTO_SPACE,
        PAT_NEXT,
        PAT_TOKEN_WORD,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_NEXT+PAT_GOTO_TILDESPACE,
        PAT_NEXT,
        PAT_NEXT,
        PAT_NEXT,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_open[MAX_MODE] =
    {
        PAT_NEXT+PAT_TOKEN_OPEN,
        PAT_TOKEN_SPACE,
        PAT_TOKEN_WORD,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_TILDE+PAT_RETURN,
        PAT_RETURN,
        PAT_NEXT,
        PAT_TILDE+PAT_TOKEN_WORD,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_close[MAX_MODE] =
    {
        PAT_NEXT+PAT_TOKEN_CLOSE,
        PAT_TOKEN_SPACE,
        PAT_TOKEN_WORD,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_TILDE+PAT_RETURN,
        PAT_RETURN,
        PAT_NEXT,
        PAT_TILDE+PAT_TOKEN_WORD,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_bar[MAX_MODE] =
    {
        PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_RETURN_TO_WORD+PAT_GOTO_BARRED+PAT_ERROR_POS,
        PAT_TOKEN_SPACE,
        PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_GOTO_BARRED+PAT_RETURN_TO_SELF+PAT_ERROR_POS,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_TILDE+PAT_RETURN,
        PAT_RETURN,
        PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_GOTO_BARRED+PAT_RETURN_TO_SELF+PAT_ERROR_POS,
        PAT_TILDE+PAT_TOKEN_WORD,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_backslash[MAX_MODE] =
    {
        PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_RETURN_TO_WORD+PAT_GOTO_BACKSLASHED+PAT_ERROR_POS,
        PAT_TOKEN_SPACE,
        PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_RETURN_TO_SELF+PAT_GOTO_BACKSLASHED+PAT_ERROR_POS,
        PAT_PUSH+PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_BACKSLASHED,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_TILDE+PAT_RETURN,
        PAT_RETURN,
        PAT_NEXT2,
        PAT_TILDE+PAT_TOKEN_WORD,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_tilde[MAX_MODE] =
    {
        PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_TILDE+PAT_ERROR_POS,
        PAT_TOKEN_SPACE,
        PAT_NEXT+PAT_MUTATED+PAT_RETURN_TO_SELF+PAT_GOTO_TILDE+PAT_ERROR_POS,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_TILDE+PAT_RETURN,
        PAT_RETURN,
        PAT_NEXT+PAT_MUTATED+PAT_RETURN_TO_CALLER+PAT_GOTO_SEMITILDE+PAT_ERROR_POS,
        PAT_TILDE+PAT_TOKEN_WORD,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_semicolon[MAX_MODE] =
    {
        PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_SEMICOLON,
        PAT_TOKEN_SPACE,
        PAT_NEXT+PAT_MUTATED+PAT_RETURN_TO_SELF+PAT_GOTO_SEMICOLON,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_TILDE+PAT_RETURN,
        PAT_RETURN,
        PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_SEMITILDE,
        PAT_TILDE+PAT_TOKEN_WORD,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_else[MAX_MODE] =
    {
        PAT_PUSH+PAT_NEXT+PAT_GOTO_WORD,
        PAT_TOKEN_SPACE,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_TILDE+PAT_RETURN,
        PAT_RETURN,
        PAT_NEXT,
        PAT_TILDE+PAT_TOKEN_WORD,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_parens[MAX_MODE] =
    {
        PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
        PAT_TOKEN_SPACE,
        PAT_TOKEN_WORD,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_TILDE+PAT_RETURN,
        PAT_RETURN,
        PAT_NEXT,
        PAT_TILDE+PAT_TOKEN_WORD,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_equal[MAX_MODE] =
    {
        PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
        PAT_TOKEN_SPACE,
        PAT_TOKEN_WORD,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_TILDE+PAT_RETURN,
        PAT_RETURN,
        PAT_NEXT,
        PAT_TILDE+PAT_TOKEN_WORD,
        PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
        PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD
    };
    static int mode_less[MAX_MODE] =
    {
        PAT_PUSH+PAT_NEXT+PAT_GOTO_LESS,
        PAT_PUSH+PAT_NEXT+PAT_GOTO_LESS,
        PAT_TOKEN_WORD,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_TILDE+PAT_RETURN,
        PAT_RETURN,
        PAT_NEXT,
        PAT_TILDE+PAT_TOKEN_WORD,
        PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };
    static int mode_greater[MAX_MODE] =
    {
        PAT_PUSH+PAT_NEXT+PAT_GOTO_GREATER,
        PAT_PUSH+PAT_NEXT+PAT_GOTO_GREATER,
        PAT_TOKEN_WORD,
        PAT_PUSH+PAT_NEXT,
        PAT_PUSH+PAT_NEXT+PAT_RETURN,
        PAT_TILDE+PAT_RETURN,
        PAT_RETURN,
        PAT_NEXT,
        PAT_TILDE+PAT_TOKEN_WORD,
        PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
        PAT_TOKEN_WORD
    };

again:
    GET_CHAR;   // presumably loads the current input character into ch

#ifdef DEBUG_TOKENS
    printf("<TOKENS> length=%d\n",len);
    if(ch<TEXT(' '))
        printf("<TOKENS> get(#%d)\t",ch);
    else
        printf("<TOKENS> get('%c')\t",ch);
    switch( mode )
    {
    case MODE_ENTRY: printf(" ENTRY -> "); break;
    case MODE_SPACE: printf(" SPACE -> "); break;
    case MODE_WORD: printf(" WORD -> "); break;
    case MODE_BARRED: printf(" BARRED -> "); break;
    case MODE_BACKSLASHED: printf(" BACKSLASH -> "); break;
    case MODE_TILDE: printf(" TILDE -> "); break;
    case MODE_SEMITILDE: printf(" SEMITILDE -> "); break;
    case MODE_SEMICOLON: printf(" SEMICOLON -> "); break;
    case MODE_TILDESPACE: printf("TILDESPACE -> "); break;
    }
#endif

    // Pick the action for the current character class. Which characters are
    // special depends on the tokenization method; everything else uses the
    // default table.
    code = mode_else[mode];
    if( !len )
        code = mode_eof[mode];
    else
    {
        if( method==TOKENIZE_READWORD )
        {
            if( ch==TEXT('|') ) code = mode_bar[mode];
            else if( ch==TEXT('\\') ) code = mode_backslash[mode];
        }
        else
        if( method==TOKENIZE_READLIST )
        {
            if( ch==TEXT('\n') ) code = mode_eol[mode];
            else if( ch==TEXT('\r') ) code = mode_eol[mode];
            else if( ch<=TEXT(' ') ) code = mode_space[mode];
            else if( ch==TEXT('[') ) code = mode_open[mode];
            else if( ch==TEXT(']') ) code = mode_close[mode];
            else if( ch==TEXT('|') ) code = mode_bar[mode];
            else if( ch==TEXT('\\') ) code = mode_backslash[mode];
            else if( ch==TEXT('~') ) code = mode_tilde[mode];
        }
        else
        if( method==TOKENIZE_DATA )
        {
            if( ch==TEXT('\n') ) code = mode_eol[mode];
            else if( ch==TEXT('\r') ) code = mode_eol[mode];
            else if( ch<=TEXT(' ') ) code = mode_space[mode];
            else if( ch==TEXT('[') ) code = mode_open[mode];
            else if( ch==TEXT(']') ) code = mode_close[mode];
            else if( ch==TEXT('|') ) code = mode_bar[mode];
            else if( ch==TEXT('\\') ) code = mode_backslash[mode];
            else if( ch==TEXT('~') ) code = mode_tilde[mode];
            else if( ch==TEXT(';') ) code = mode_semicolon[mode];
        }
        else
        {
            // Any other method: only parentheses and operators split words.
            if( ch==TEXT('(') ||
                ch==TEXT(')') ) code = mode_parens[mode];
            else if( *buffer!=TEXT('"') )
            {
                // Operators are not special inside a word that starts with
                // a double quote. A sign right after "<digits>e" is part of
                // a number and keeps the default action.
                if( (ch==TEXT('+') || ch==TEXT('-')) && is_number(bp) ) { }
                else if( ch==TEXT('+') ||
                         ch==TEXT('-') ||
                         ch==TEXT('*') ||
                         ch==TEXT('/')) code = mode_parens[mode];
                else if( ch==TEXT('=') ) code = mode_equal[mode];
                else if( ch==TEXT('<') ) code = mode_less[mode];
                else if( ch==TEXT('>') ) code = mode_greater[mode];
            }
        }
    }

    // Execute the action bits of the selected code.
    int _stack = -1;
    int newmode = 0;
    if( code & PAT_GOTO )
    {
        newmode = (code>>PAT_SHIFT) & 0xF;
        // Leaving the whitespace mode: the token starts at this position.
        if( mode==MODE_SPACE && newmode!=MODE_SPACE ) { sp = source; }
    }
    if( code & PAT_TILDE )
    {
        // Put back a tilde that was consumed without being stored.
        *bp++ = TEXT('~');
#ifdef DEBUG_TOKENS
        printf("\n<TOKENS> put('%c'/%d)\n",TEXT('~'),TEXT('~'));
#endif //DEBUG_TOKENS
    }
    if( code & PAT_PUSH )
    {
        // Characters of a barred section that was entered from one of the
        // comment modes are not stored.
        if( mode!=MODE_BARRED ||
            (stack[mode]!=MODE_SEMITILDE &&
             stack[mode]!=MODE_SEMICOLON) )
        {
            *bp++ = ch;
#ifdef DEBUG_TOKENS
            printf("\n<TOKENS> put('%c'/%d)\n",ch,ch);
#endif //DEBUG_TOKENS
        }
    }
    if( code & PAT_NEXT2 )
    {
        // Skip one extra character, but never step past the end of the
        // input: len must not become negative or the end-of-input test
        // above would never fire again.
        if( len>1 )
        {
            source++;
            len--;
        }
    }
    if( code & (PAT_NEXT|PAT_NEXT2) )
    {
        // CR LF counts as a single line break. Only look ahead while the
        // next character still belongs to the input.
        if( len>1 && *source==TEXT('\r') && *(source+1)==TEXT('\n') )
        {
            source++;
            len--;
        }
        source++;
        len--;
    }
    if( code & PAT_DIRTY ) dirty = 1;
    if( code & PAT_MUTATED ) mutated = 1;
    if( code & PAT_ERROR_POS ) errpos = origlen-len-1;
    if( code & PAT_RETURN_TO_WORD ) _stack = MODE_WORD;
    if( code & PAT_RETURN_TO_SELF ) _stack = mode;
    if( code & PAT_RETURN_TO_CALLER ) _stack = stack[mode];
    if( code & PAT_GOTO )
    {
        mode = newmode;
        stack[mode] = _stack;
    }
    if( code & PAT_RETURN ) mode = stack[mode];
    if( code & PAT_TOKEN )
    {
        int _token = (code>>PAT_SHIFT) & 0xF;
        if( _token!=TOKEN_WORD ) return _token;
        if( mutated )
        {
            // The token text differs from the input text, so a new word is
            // built from the buffer.
            *bp = NULL_CHAR;
            *token = new_word( buffer, UNKNOWN );
#ifdef DEBUG_TOKENS
            printf("MUTATED TOKEN "); dumpln(*token);
            printf("\n\n");
#endif
        }
        else
        {
            // The token is a contiguous piece of the input: share its text.
            *token = new_subword( input, sp, source-sp );
#ifdef DEBUG_TOKENS
            printf("NORMAL TOKEN **"); dump(*token);
            printf("** (len=%d)\n\n\n",source-sp);
#endif
        }
        return dirty?TOKEN_DIRTY_WORD:TOKEN_WORD;
    }
    if( code & PAT_ERROR )
    {
#ifdef DEBUG_TOKENS
        printf("ERROR\n");
#endif
        return TOKEN_ERROR;
    }
#ifdef DEBUG_TOKENS
    switch( mode )
    {
    case MODE_ENTRY: printf("ENTRY\n"); break;
    case MODE_SPACE: printf("SPACE\n"); break;
    case MODE_WORD: printf("WORD\n"); break;
    case MODE_BARRED: printf("BARRED\n"); break;
    case MODE_BACKSLASHED: printf("BACKSLASH\n"); break;
    case MODE_TILDE: printf("TILDE\n"); break;
    case MODE_SEMITILDE: printf("SEMITILDE\n"); break;
    case MODE_SEMICOLON: printf("SEMICOLON\n"); break;
    case MODE_TILDESPACE: printf("TILDESPACE\n"); break;
    }
#endif
    goto again;
}
// Collects tokens into a list until the input ends or a closing bracket is
// met, recursing for every opening bracket.
//
// level      - nesting depth; 0 for the outermost list
// full_parse - tokenization method handed to get_token()
// pos        - when not NULL, receives NULL or (for TOKENIZE_DATA) a subword
//              of the input covering the text of this list
//
// Returns the collected list. Failure is reported through last_token
// (TOKEN_ERROR) and errpos of the enclosing function; the partial list is
// still returned and the caller is expected to release it.
atom_t get_sublist( int level, int full_parse, atom_t* pos )
{
    atom_t head = empty_list;          // list being built
    atom_t tail = empty_list;          // its last node, maintained by append()
    atom_t token = NULL;
    atom_t nested_pos = NULL;          // source text of a nested list
    int pending = 0;                   // flags for the next appended node
    int len_at_entry = len;            // remaining length when the list began
    int first = source-STRING(input);  // offset where this list starts

    if( pos ) *pos = NULL;
    if( first ) first--;

    for( ;; )
    {
        last_token = get_token( &token, full_parse );
        if( !last_token ) break;
        nested_pos = NULL;

#ifdef DEBUG_TOKENIZATION
        switch(last_token)
        {
        case TOKEN_END: printf("TOKEN_END\n"); break;
        case TOKEN_SPACE: printf("TOKEN_SPACE\n"); break;
        case TOKEN_WORD: printf("TOKEN_WORD @"); dump(token); printf("@\n"); break;
        case TOKEN_DIRTY_WORD: printf("TOKEN_|WORD| @"); dump(token); printf("@\n"); break;
        case TOKEN_OPEN: printf("TOKEN_OPEN [\n"); break;
        case TOKEN_CLOSE: printf("TOKEN_CLOSE ]\n"); break;
        case TOKEN_LINEEND: printf("TOKEN_LINEEND\n"); break;
        case TOKEN_ERROR: printf("TOKEN_ERROR\n"); break;
        }
#endif

        // Tokens that never produce a list element.
        if( last_token==TOKEN_ERROR ) return head;
        if( last_token==TOKEN_CLOSE ) break;
        if( last_token==TOKEN_SPACE )
        {
            // Whitespace is recorded as flags on both neighbours.
            if( IS_NOT_EMPTY(tail) ) SET_FLAGS( tail, FLAG_BEFORE_SPACE );
            pending |= FLAG_AFTER_SPACE;
            continue;
        }
        if( last_token==TOKEN_LINEEND )
        {
            pending |= FLAG_NEWLINE;
            continue;
        }

        if( last_token==TOKEN_DIRTY_WORD )
        {
            // Replace the word containing bars/backslashes by its purified
            // version.
            atom_t clean = purify( token );
            DEUSE( token );
            token = clean;
        }
        else if( last_token==TOKEN_OPEN )
        {
            // Nested lists are always scanned as data.
            token = get_sublist( level+1, TOKENIZE_DATA, &nested_pos );
            if( last_token==TOKEN_ERROR )
            {
                DEUSE( token );
                if( nested_pos ) DEUSE(nested_pos);
                nested_pos = NULL;
                break;
            }
        }

        if( method==TOKENIZE_COMMANDS )
            pending |= FLAG_TOKENIZED_COMMANDS;
        if( method==TOKENIZE_DATA )
            pending |= FLAG_TOKENIZED_DATA;

        // In command mode a word made of '?' followed by a digit is split
        // into "(", "?", the remaining text and ")".
        int is_slot = method==TOKENIZE_COMMANDS
                   && last_token==TOKEN_WORD
                   && LENGTH(token)>1
                   && *STRING(token)==TEXT('?')
                   && *(STRING(token)+1)>=TEXT('0')
                   && *(STRING(token)+1)<=TEXT('9');
        if( is_slot )
        {
            atom_t mark = new_subword( token, STRING(token), 1 );
            atom_t rest = new_subword( token, STRING(token)+1, LENGTH(token)-1 );
            DEUSE( token );
            append( new_word(TEXT("("),-1), &head, &tail );
            append( mark, &head, &tail );
            append( rest, &head, &tail );
            append( new_word(TEXT(")"),-1), &head, &tail );
        }
        else
        {
            if( nested_pos )
            {
                // Keep the source text of the nested list in the new node.
                append_ex( token, &head, &tail );
                POS( tail ) = nested_pos;
                nested_pos = NULL;
            }
            else
            {
                append( token, &head, &tail );
            }
            SET_FLAGS( tail, pending );
        }
        pending = 0;
        if( last_token==TOKEN_ERROR ) break;
    }

    // Unbalanced brackets: a nested list that reached the end of the input,
    // or a closing bracket at the outermost level.
    if( level && last_token==TOKEN_END )
    {
        errpos = origlen-len_at_entry-1;
        last_token = TOKEN_ERROR;
    }
    else if( !level && last_token==TOKEN_CLOSE )
    {
        errpos = origlen-len-1;
        last_token = TOKEN_ERROR;
    }

    int final = source-STRING(input);  // offset where this list ends
    if( final ) final--;
    if( pos && method == TOKENIZE_DATA )
    {
        *pos = new_subword( input, STRING(input)+first, final-first+1 );
    }
    return head;
}
// Scan the whole input as the outermost (level 0) list.
atom_t parsed = get_sublist( 0, method, NULL );

// On failure the partial list is replaced by a parse error atom that
// carries the error position and the input as its source.
if( last_token==TOKEN_ERROR )
{
    DEUSE( parsed );
    parsed = new_parse_error( ERROR_INCOMPLETE_PAIR, errpos, input );
}

DEALLOC( buffer );
return parsed;
}