|
Lhogho
0.0.027
|
Tokenization methods | |
| #define | TOKENIZE_DATA 0 |
| tokenize as data | |
| #define | TOKENIZE_COMMANDS 1 |
| tokenize as commands | |
| #define | TOKENIZE_READWORD 2 |
| tokenize as for READWORD function | |
| #define | TOKENIZE_READLIST 3 |
| tokenize as for READLIST function | |
| #define | ENBAR(ch) (((unsigned char)ch)>127 ? (ch) : enbar[(unsigned char)ch]) |
| #define | DEBAR(ch) ((ch)>31 ? (ch) : debar[(unsigned char)ch]) |
| char_t | enbar [128] |
| table for a->|a| conversions | |
| char_t | debar [32] |
| table for |a|->a conversions | |
| void | init_parser () |
| initializes parser | |
| atom_t | tokenize (atom_t input, int method) |
| tokenizes into a list | |
| atom_t | trim_shell_comment (atom_t word) |
| trims shell comment (if any) | |
| atom_t | purify (atom_t word) |
| purifies a word | |
| atom_t | build_syntax_tree (atom_t function) |
| parses body of user-defined function | |
| #define TOKENIZE_DATA 0 |
| #define TOKENIZE_COMMANDS 1 |
| #define TOKENIZE_READWORD 2 |
| #define TOKENIZE_READLIST 3 |
| void init_parser | ( | ) |
Initializes tables enbar[] and debar[] which are used to enbar and debar a character.
{
  // NOTE! if the string of enbarrable characters is
  // changed, also change dump_word()
  //
  // The position of a character in this string is the code it gets
  // when it is enbarred; '_' marks positions that are not used.
  chars_t barred = TEXT("______________()+-*/=<>|?_______");
  int code;

  // Both tables start as identity mappings
  // (debar[] has 32 entries, enbar[] has 128 entries)
  for( code=0; code<128; code++ )
    {
      enbar[code] = code;
      if( code<32 ) debar[code] = code;
    }

  // Register every enbarrable character in both directions
  for( code=0; barred[code]; code++ )
    {
      if( barred[code]!=TEXT('_') )
        {
          debar[code] = barred[code];
          enbar[(unsigned char)barred[code]] = code;
        }
    }
}
| input | word, subword or list to be tokenized |
| method | method of tokenization |
Tokenizes a word, a subword or a list into a list. Where possible, the resulting words are created as subwords of the input. Backslashes and bars in words are preserved (i.e. words are not purified). Comments and line continuations are ignored.
If the method is TOKENIZE_DATA then the input is tokenized as if it contains Logo data. If the method is TOKENIZE_COMMANDS then the input is tokenized as if it contains Logo commands. If the method is TOKENIZE_READWORD then the input is tokenized as expected by the READWORD function. If the method is TOKENIZE_READLIST then the input is tokenized as expected by the READLIST function.
If the input is a list then all its elements are tokenized one-by-one.
The return value is the tokenized list. If there is an error, returns an error atom whose error code is ERROR_INCOMPLETE_PAIR; the error position is the position in the word (0-based) and the error source is the word itself.
{
  // Tokenizes INPUT (a word, a subword or a list) according to METHOD
  // (one of the TOKENIZE_xxx constants) and returns the list of tokens.
  // On error the partial result is released and the value of
  // new_parse_error( ERROR_INCOMPLETE_PAIR, errpos, input ) is returned.
  //
  // Structure of the function:
  //   - lists that already carry the needed tokenization flag, and
  //     floats, are returned as they are (with USE);
  //   - other lists are tokenized element by element (recursively);
  //   - words and subwords are scanned by the nested get_token(),
  //     which is a table-driven state machine, and the tokens are
  //     assembled into (nested) lists by the nested get_sublist().
  //
  // The nested functions share the locals source, len, origlen,
  // buffer, errpos and last_token of this function.

  if( IS_LIST(input) )
    {
      if( method==TOKENIZE_DATA &&
          GET_FLAGS( input, FLAG_TOKENIZED_DATA|FLAG_TOKENIZED_COMMANDS ) )
        return USE(input);
      if( method==TOKENIZE_COMMANDS &&
          GET_FLAGS( input, FLAG_TOKENIZED_COMMANDS ) )
        return USE(input);
    }
  if( IS_FLOAT(input) )
    {
      return USE(input);
    }

#ifdef SAFEMODE
  assert( IS_WORD(input) || IS_SUBWORD(input) || IS_LIST(input) );
#endif

  // First check whether the input is a list.
  // If it is then tokenize recursively all its elements.
  if( IS_LIST(input) )
    {
      atom_t result = empty_list;
      atom_t last = empty_list;
      atom_t x;
      atom_t y;
      for( x=input; IS_NOT_EMPTY(x); x=CDR(x) )
        {
          // nested lists are always tokenized as data
          int submethod = IS_LIST(CAR(x))?TOKENIZE_DATA:method;
          atom_t element = tokenize( CAR(x), submethod );
          if( IS_ERROR(element) )
            {
              // the error atom replaces whatever was collected so far
              DEUSE( result );
              result = element;
              break;
            }

          // flags of the original node which go to the first
          // (initial) and to the last (final) node generated from it
          int initial_flags = GET_FLAGS( x, FLAG_NEWLINE|FLAG_AFTER_SPACE );
          int final_flags = GET_FLAGS( x, FLAG_BEFORE_SPACE );

          if( IS_FLOAT(element) )
            {
              append( USE(element), &result, &last );
              SET_FLAGS( last, initial_flags|final_flags );
            }
          else if( IS_LIST(CAR(x)) )
            {
              // a sublist is appended as a single element; an extended
              // node keeps the position atom of the original node
              if( IS_EXTENDED(x) )
                {
                  append_ex( USE(element), &result, &last );
                  DEUSE( POS(last) );
                  POS( last ) = USE( POS(x) );
                }
              else
                append( USE(element), &result, &last );
              SET_FLAGS( last, initial_flags|final_flags );
            }
          else
            {
              // a word may produce several tokens; splice all of
              // them into the result
              for( y=element; IS_NOT_EMPTY(y); y=CDR(y) )
                {
                  if( IS_EXTENDED(y) )
                    {
                      append_ex( USE(CAR(y)), &result, &last );
                      DEUSE( POS(last) );
                      POS( last ) = USE( POS(y) );
                    }
                  else
                    append( USE(CAR(y)), &result, &last );
                  SET_FLAGS( last, FLAGS(y) );
                  if( y==element ) SET_FLAGS( last, initial_flags );
                }
              // The guard below used to test SAFE_MODE, a macro name
              // which is not used anywhere else (the other guards test
              // SAFEMODE), so the assertion was never compiled.
#ifdef SAFEMODE
              assert( IS_NOT_EMPTY(last) );
#endif
              SET_FLAGS( last, final_flags);
            }
          DEUSE( element );
        }
      return result;
    }

  // The input is a word or a subword
  chars_t source = STRING(input);
  int len = LENGTH(input);
  int origlen = len;
  // Buffer for the longest word. One extra element is reserved for
  // the terminating NULL_CHAR: a mutated token can consist of as many
  // characters as the input has (e.g. a~b is collected as a~b), and
  // the terminator is stored right after them.
  chars_t buffer = ALLOC( CHAR_SIZE*(len+1) );
  char_t ch;
  int_t errpos = -1;
  int last_token = TOKEN_LINEEND;

  // Gets the next token. Return:
  // TOKEN_END if there are no more tokens
  // TOKEN_WORD if the token is a word
  // TOKEN_DIRTY_WORD if the token is a word with \ or |
  // TOKEN_OPEN if the token is [
  // TOKEN_CLOSE if the token is ]
  // TOKEN_LINEEND if the token is <nl>
  // TOKEN_SPACE if at least one whitespace is met
  // TOKEN_ERROR if the input ends in a mode which cannot end
  //
  // For word tokens *token receives the new word or subword.
  int get_token( atom_t *token, int method )
  {
    // return 1 if buffer contains number
    //
    // The characters collected so far (buffer..bp) are scanned
    // backwards with a small state machine; running out of
    // characters in any state counts as success.
    // NOTE(review): state 2 accepts exactly one digit before it
    // requires the '.', which looks narrower than the {digit}*
    // of the pattern below -- confirm the intended grammar.
    int is_number(chars_t bp)
    { // "E" {digit}* "." {digit}+
      //  1     2      3     4
      chars_t cp = bp;
      int num_mode = 1;
      char_t ch;
      if( bp==buffer ) return 0;
      while( cp>buffer )
        {
          cp--;
          ch = *cp;
          switch( num_mode )
            {
            case 1:
              if( ch!=TEXT('E') && ch!=TEXT('e') ) return 0;
              num_mode = 2;
              break;
            case 2: ;
              if( ch<TEXT('0') || ch>TEXT('9') ) return 0;
              num_mode = 3;
              break;
            case 3: ;
              if( ch!=TEXT('.') ) return 0;
              num_mode = 4;
              break;
            case 4: ;
              if( ch<TEXT('0') || ch>TEXT('9') ) return 0;
              break;
            }
        }
      return 1;
    }

    if( !len ) return TOKEN_END;

    int dirty = 0;      // set by PAT_DIRTY, gives TOKEN_DIRTY_WORD
    int mutated = 0;    // set by PAT_MUTATED, the token is built from buffer
    chars_t bp = buffer; *bp=NULL_CHAR;
    chars_t sp = source;          // where the current token starts
    int mode = MODE_ENTRY;        // current mode
    int code;                     // action code
    int stack[MAX_MODE];          // return-to-mode for each mode

    // Action tables. There is one table per class of input character
    // and each table is indexed by the current mode. An entry is a
    // sum of PAT_xxx action bits which are processed below.

    static int mode_eof[MAX_MODE] =
      {
        /* entry */ PAT_TOKEN_END,
        /* whitespace */ PAT_TOKEN_SPACE,
        /* word */ PAT_TOKEN_WORD,
        /* barred */ PAT_ERROR,
        /* backslashed */ PAT_ERROR,
        /* tilde */ PAT_ERROR,
        /* semitilde */ PAT_ERROR,
        /* semicolon */ PAT_RETURN,
        /* tildespace */ PAT_ERROR,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_eol[MAX_MODE] =
      {
        /* entry */ PAT_NEXT+PAT_TOKEN_LINE,
        /* whitespace */ PAT_TOKEN_SPACE,
        /* word */ PAT_TOKEN_WORD,
        /* barred */ PAT_PUSH+PAT_NEXT,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_NEXT+PAT_RETURN,
        /* semitilde */ PAT_NEXT+PAT_RETURN,
        /* semicolon */ PAT_RETURN,
        /* tildespace */ PAT_NEXT+PAT_RETURN,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_space[MAX_MODE] =
      {
        /* entry */ PAT_NEXT+PAT_GOTO_SPACE,
        /* whitespace */ PAT_NEXT,
        /* word */ PAT_TOKEN_WORD,
        /* barred */ PAT_PUSH+PAT_NEXT,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_NEXT+PAT_GOTO_TILDESPACE,
        /* semitilde */ PAT_NEXT,
        /* semicolon */ PAT_NEXT,
        /* tildespace */ PAT_NEXT,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_open[MAX_MODE] =
      {
        /* entry */ PAT_NEXT+PAT_TOKEN_OPEN,
        /* whitespace */ PAT_TOKEN_SPACE,
        /* word */ PAT_TOKEN_WORD,
        /* barred */ PAT_PUSH+PAT_NEXT,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_TILDE+PAT_RETURN,
        /* semitilde */ PAT_RETURN,
        /* semicolon */ PAT_NEXT,
        /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_close[MAX_MODE] =
      {
        /* entry */ PAT_NEXT+PAT_TOKEN_CLOSE,
        /* whitespace */ PAT_TOKEN_SPACE,
        /* word */ PAT_TOKEN_WORD,
        /* barred */ PAT_PUSH+PAT_NEXT,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_TILDE+PAT_RETURN,
        /* semitilde */ PAT_RETURN,
        /* semicolon */ PAT_NEXT,
        /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_bar[MAX_MODE] =
      {
        /* entry */ PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_RETURN_TO_WORD+PAT_GOTO_BARRED+PAT_ERROR_POS,
        /* whitespace */ PAT_TOKEN_SPACE,
        /* word */ PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_GOTO_BARRED+PAT_RETURN_TO_SELF+PAT_ERROR_POS,
        /* barred */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_TILDE+PAT_RETURN,
        /* semitilde */ PAT_RETURN,
        /* semicolon */ PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_GOTO_BARRED+PAT_RETURN_TO_SELF+PAT_ERROR_POS,
        /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_backslash[MAX_MODE] =
      {
        /* entry */ PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_RETURN_TO_WORD+PAT_GOTO_BACKSLASHED+PAT_ERROR_POS,
        /* whitespace */ PAT_TOKEN_SPACE,
        /* word */ PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_RETURN_TO_SELF+PAT_GOTO_BACKSLASHED+PAT_ERROR_POS,
        /* barred */ PAT_PUSH+PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_BACKSLASHED,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_TILDE+PAT_RETURN,
        /* semitilde */ PAT_RETURN,
        /* semicolon */ PAT_NEXT2, //+PAT_RETURN_TO_SELF+PAT_GOTO_BACKSLASHED,
        /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_tilde[MAX_MODE] =
      {
        /* entry */ PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_TILDE+PAT_ERROR_POS,
        /* whitespace */ PAT_TOKEN_SPACE,
        /* word */ PAT_NEXT+PAT_MUTATED+PAT_RETURN_TO_SELF+PAT_GOTO_TILDE+PAT_ERROR_POS,
        /* barred */ PAT_PUSH+PAT_NEXT,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_TILDE+PAT_RETURN,
        /* semitilde */ PAT_RETURN,
        /* semicolon */ PAT_NEXT+PAT_MUTATED+PAT_RETURN_TO_CALLER+PAT_GOTO_SEMITILDE+PAT_ERROR_POS,
        /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_semicolon[MAX_MODE] =
      {
        /* entry */ PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_SEMICOLON,
        /* whitespace */ PAT_TOKEN_SPACE,
        /* word */ PAT_NEXT+PAT_MUTATED+PAT_RETURN_TO_SELF+PAT_GOTO_SEMICOLON,
        /* barred */ PAT_PUSH+PAT_NEXT,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_TILDE+PAT_RETURN,
        /* semitilde */ PAT_RETURN,
        /* semicolon */ PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_SEMITILDE,
        /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_else[MAX_MODE] =
      {
        /* entry */ PAT_PUSH+PAT_NEXT+PAT_GOTO_WORD,
        /* whitespace */ PAT_TOKEN_SPACE,
        /* word */ PAT_PUSH+PAT_NEXT,
        /* barred */ PAT_PUSH+PAT_NEXT,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_TILDE+PAT_RETURN,
        /* semitilde */ PAT_RETURN,
        /* semicolon */ PAT_NEXT,
        /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_parens[MAX_MODE] =
      {
        /* entry */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
        /* whitespace */ PAT_TOKEN_SPACE,
        /* word */ PAT_TOKEN_WORD,
        /* barred */ PAT_PUSH+PAT_NEXT,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_TILDE+PAT_RETURN,
        /* semitilde */ PAT_RETURN,
        /* semicolon */ PAT_NEXT,
        /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_equal[MAX_MODE] =
      {
        /* entry */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
        /* whitespace */ PAT_TOKEN_SPACE,
        /* word */ PAT_TOKEN_WORD,
        /* barred */ PAT_PUSH+PAT_NEXT,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_TILDE+PAT_RETURN,
        /* semitilde */ PAT_RETURN,
        /* semicolon */ PAT_NEXT,
        /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
        /* less */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
        /* greater */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD
      };
    static int mode_less[MAX_MODE] =
      {
        /* entry */ PAT_PUSH+PAT_NEXT+PAT_GOTO_LESS,
        /* whitespace */ PAT_PUSH+PAT_NEXT+PAT_GOTO_LESS, //PAT_TOKEN_WORD, @boza
        /* word */ PAT_TOKEN_WORD,
        /* barred */ PAT_PUSH+PAT_NEXT,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_TILDE+PAT_RETURN,
        /* semitilde */ PAT_RETURN,
        /* semicolon */ PAT_NEXT,
        /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
        /* less */ PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };
    static int mode_greater[MAX_MODE] =
      {
        /* entry */ PAT_PUSH+PAT_NEXT+PAT_GOTO_GREATER,
        /* whitespace */ PAT_PUSH+PAT_NEXT+PAT_GOTO_GREATER, //PAT_TOKEN_WORD, @boza
        /* word */ PAT_TOKEN_WORD,
        /* barred */ PAT_PUSH+PAT_NEXT,
        /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
        /* tilde */ PAT_TILDE+PAT_RETURN,
        /* semitilde */ PAT_RETURN,
        /* semicolon */ PAT_NEXT,
        /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
        /* less */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
        /* greater */ PAT_TOKEN_WORD
      };

  again:
    // get action code
    GET_CHAR;
#ifdef DEBUG_TOKENS
    printf("<TOKENS> length=%d\n",len);
    if(ch<TEXT(' '))
      printf("<TOKENS> get(#%d)\t",ch);
    else
      printf("<TOKENS> get('%c')\t",ch);
    switch( mode )
      {
      case MODE_ENTRY: printf(" ENTRY -> "); break;
      case MODE_SPACE: printf(" SPACE -> "); break;
      case MODE_WORD: printf(" WORD -> "); break;
      case MODE_BARRED: printf(" BARRED -> "); break;
      case MODE_BACKSLASHED: printf(" BACKSLASH -> "); break;
      case MODE_TILDE: printf(" TILDE -> "); break;
      case MODE_SEMITILDE: printf(" SEMITILDE -> "); break;
      case MODE_SEMICOLON: printf(" SEMICOLON -> "); break;
      case MODE_TILDESPACE: printf("TILDESPACE -> "); break;
      }
#endif

    // The default action is the one for an ordinary character; it is
    // replaced if the input is exhausted or if the character is
    // special for the selected tokenization method.
    code = mode_else[mode];
    if( !len )
      code = mode_eof[mode];
    else
      {
        if( method==TOKENIZE_READWORD )
          { // tokenize as expected by READWORD
            if( ch==TEXT('|') ) code = mode_bar[mode];
            else if( ch==TEXT('\\') ) code = mode_backslash[mode];
          }
        else
          if( method==TOKENIZE_READLIST )
            { // tokenize as expected by READLIST (as data, but
              // the semicolon is not special)
              if( ch==TEXT('\n') ) code = mode_eol[mode];
              else if( ch==TEXT('\r') ) code = mode_eol[mode];
              else if( ch<=TEXT(' ') ) code = mode_space[mode];
              else if( ch==TEXT('[') ) code = mode_open[mode];
              else if( ch==TEXT(']') ) code = mode_close[mode];
              else if( ch==TEXT('|') ) code = mode_bar[mode];
              else if( ch==TEXT('\\') ) code = mode_backslash[mode];
              else if( ch==TEXT('~') ) code = mode_tilde[mode];
            }
          else
            if( method==TOKENIZE_DATA )
              { // tokenize as data
                if( ch==TEXT('\n') ) code = mode_eol[mode];
                else if( ch==TEXT('\r') ) code = mode_eol[mode];
                else if( ch<=TEXT(' ') ) code = mode_space[mode];
                else if( ch==TEXT('[') ) code = mode_open[mode];
                else if( ch==TEXT(']') ) code = mode_close[mode];
                else if( ch==TEXT('|') ) code = mode_bar[mode];
                else if( ch==TEXT('\\') ) code = mode_backslash[mode];
                else if( ch==TEXT('~') ) code = mode_tilde[mode];
                else if( ch==TEXT(';') ) code = mode_semicolon[mode];
              }
            else
              { // tokenize as commands
                if( ch==TEXT('(') ||
                    ch==TEXT(')') ) code = mode_parens[mode];
                else if( *buffer!=TEXT('"') )
                  {
                    // operators are not special in a quoted word;
                    // a sign right after a mantissa (see is_number)
                    // stays an ordinary character
                    if( (ch==TEXT('+') || ch==TEXT('-')) && is_number(bp) ) { }
                    else if( ch==TEXT('+') ||
                             ch==TEXT('-') ||
                             ch==TEXT('*') ||
                             ch==TEXT('/')) code = mode_parens[mode];
                    else if( ch==TEXT('=') ) code = mode_equal[mode];
                    else if( ch==TEXT('<') ) code = mode_less[mode];
                    else if( ch==TEXT('>') ) code = mode_greater[mode];
                  }
              }
      }

    // process action code
    int _stack = -1;
    int newmode = 0;
    if( code & PAT_GOTO )
      {
        // this is pre-processing of GOTO
        // if old mode was space, and new is not,
        // then update initial position of next token
        newmode = (code>>PAT_SHIFT) & 0xF;
        if( mode==MODE_SPACE && newmode!=MODE_SPACE ) { sp = source; }
      }
    if( code & PAT_TILDE )
      {
        *bp++ = TEXT('~');
#ifdef DEBUG_TOKENS
        printf("\n<TOKENS> put('%c'/%d)\n",TEXT('~'),TEXT('~'));
#endif //DEBUG_TOKENS
      }
    if( code & PAT_PUSH )
      {
        // push a character only if:
        // - currently not in bars |..?..|
        // - currently in bars, but not in semicolon ;..|..?..|
        if( mode!=MODE_BARRED ||
            (stack[mode]!=MODE_SEMITILDE &&
             stack[mode]!=MODE_SEMICOLON) )
          {
            *bp++ = ch;
#ifdef DEBUG_TOKENS
            printf("\n<TOKENS> put('%c'/%d)\n",ch,ch);
#endif //DEBUG_TOKENS
          }
      }
    if( code & PAT_NEXT2 )
      {
        source++;
        len--;
      }
    // Skip the current character (CR+LF is skipped as one character).
    // len must never become negative, otherwise the end-of-input test
    // (!len) would never succeed: the second skip of PAT_NEXT2 is
    // done only if a character is left, and a LF after a CR is
    // consumed only if it is still inside the input.
    if( (code & (PAT_NEXT|PAT_NEXT2)) && len>0 )
      {
        if( *source=='\r' && len>1 && *(source+1)=='\n' )
          {
            source++;
            len--;
          }
        source++;
        len--;
      }
    if( code & PAT_DIRTY ) dirty = 1;
    if( code & PAT_MUTATED ) mutated = 1;
    if( code & PAT_ERROR_POS ) errpos = origlen-len-1;
    if( code & PAT_RETURN_TO_WORD ) _stack = MODE_WORD;
    if( code & PAT_RETURN_TO_SELF ) _stack = mode;
    if( code & PAT_RETURN_TO_CALLER ) _stack = stack[mode];
    if( code & PAT_GOTO )
      {
        // this is post-processing of GOTO
        mode = newmode;
        stack[mode] = _stack;
      }
    if( code & PAT_RETURN ) mode = stack[mode];
    if( code & PAT_TOKEN )
      {
        int _token = (code>>PAT_SHIFT) & 0xF;
        if( _token!=TOKEN_WORD ) return _token;
        if( mutated )
          {
            // the token differs from the source text, so it is
            // created as a new word from the collected characters
            *bp = NULL_CHAR;
            *token = new_word( buffer, UNKNOWN );
#ifdef DEBUG_TOKENS
            printf("MUTATED TOKEN "); dumpln(*token);
            printf("\n\n");
#endif
          }
        else
          {
            // the token is a verbatim part of the input
            *token = new_subword( input, sp, source-sp /*bp-buffer*/ );
#ifdef DEBUG_TOKENS
            printf("NORMAL TOKEN **"); dump(*token);
            printf("** (len=%d)\n\n\n",source-sp);
#endif
          }
        return dirty?TOKEN_DIRTY_WORD:TOKEN_WORD;
      }
    if( code & PAT_ERROR )
      {
#ifdef DEBUG_TOKENS
        printf("ERROR\n");
#endif
        return TOKEN_ERROR;
      }
#ifdef DEBUG_TOKENS
    switch( mode )
      {
      case MODE_ENTRY: printf("ENTRY\n"); break;
      case MODE_SPACE: printf("SPACE\n"); break;
      case MODE_WORD: printf("WORD\n"); break;
      case MODE_BARRED: printf("BARRED\n"); break;
      case MODE_BACKSLASHED: printf("BACKSLASH\n"); break;
      case MODE_TILDE: printf("TILDE\n"); break;
      case MODE_SEMITILDE: printf("SEMITILDE\n"); break;
      case MODE_SEMICOLON: printf("SEMICOLON\n"); break;
      case MODE_TILDESPACE: printf("TILDESPACE\n"); break;
      }
#endif
    goto again;
  } // get_token()

  // Collects tokens into a list until the end of the input or until
  // a closing bracket. LEVEL is the nesting depth (0 for the
  // outermost list), FULL_PARSE is the tokenization method passed to
  // get_token(). If POS is not NULL then *POS is cleared on entry and,
  // when the outer method is TOKENIZE_DATA, it receives a subword of
  // the input computed from the source positions at entry and at exit.
  // Errors are reported through last_token (TOKEN_ERROR) and errpos.
  atom_t get_sublist( int level, int full_parse, atom_t* pos )
  {
    atom_t result = empty_list;
    atom_t last = empty_list;
    if( pos ) *pos = NULL;
    atom_t token = NULL;
    atom_t sublist_pos = NULL;
    int flags;
    int bracketlen = len; // LEN of last opening bar
    int pos_from = source-STRING(input);
    if( pos_from ) pos_from--;
    flags = 0;
    while( (last_token=get_token(&token,full_parse)) )
      {
        sublist_pos = NULL;
#ifdef DEBUG_TOKENIZATION
        switch(last_token)
          {
          case TOKEN_END: printf("TOKEN_END\n"); break;
          case TOKEN_SPACE: printf("TOKEN_SPACE\n"); break;
          case TOKEN_WORD: printf("TOKEN_WORD @"); dump(token); printf("@\n"); break;
          case TOKEN_DIRTY_WORD: printf("TOKEN_|WORD| @"); dump(token); printf("@\n"); break;
          case TOKEN_OPEN: printf("TOKEN_OPEN [\n"); break;
          case TOKEN_CLOSE: printf("TOKEN_CLOSE ]\n"); break;
          case TOKEN_LINEEND: printf("TOKEN_LINEEND\n"); break;
          case TOKEN_ERROR: printf("TOKEN_ERROR\n"); break;
          }
#endif
        if( last_token==TOKEN_ERROR ) return result;
        if( last_token==TOKEN_SPACE )
          {
            // spaces are not tokens, they are remembered as flags
            // of the neighbouring tokens
            if( IS_NOT_EMPTY(last) ) SET_FLAGS( last, FLAG_BEFORE_SPACE );
            flags |= FLAG_AFTER_SPACE;
            continue;
          }
        if( last_token==TOKEN_DIRTY_WORD )
          {
            // process backslashes and bars in the word
            atom_t x = purify( token );
            DEUSE( token );
            token = x;
          }
        if( last_token==TOKEN_CLOSE ) break;
        if( last_token==TOKEN_LINEEND )
          {
            flags |= FLAG_NEWLINE;
            continue;
          }
        if( last_token==TOKEN_OPEN)
          {
            // the contents of brackets are always tokenized as data
            token = get_sublist( level+1, TOKENIZE_DATA, &sublist_pos ); // recursive
            if( last_token==TOKEN_ERROR )
              {
                DEUSE( token );
                if( sublist_pos ) DEUSE(sublist_pos);
                sublist_pos = NULL;
                break;
              }
          }
        if( method==TOKENIZE_COMMANDS )
          flags |= FLAG_TOKENIZED_COMMANDS;
        if( method==TOKENIZE_DATA )
          flags |= FLAG_TOKENIZED_DATA;
        if( method==TOKENIZE_COMMANDS &&
            last_token==TOKEN_WORD &&
            LENGTH(token)>1 &&
            *STRING(token)==TEXT('?') &&
            *(STRING(token)+1)>=TEXT('0') &&
            *(STRING(token)+1)<=TEXT('9') )
          {
            // process template ?nn->(? nn) for command tokenization
            atom_t new_qoken = new_subword( token, STRING(token), 1 );
            atom_t new_token = new_subword( token, STRING(token)+1, LENGTH(token)-1 );
            DEUSE( token );
            append( new_word(TEXT("("),-1), &result, &last ); // (
            append( new_qoken, &result, &last ); // ?
            append( new_token, &result, &last ); // nn
            append( new_word(TEXT(")"),-1), &result, &last ); // )
          }
        else
          {
            // normal token, no more processing needed
            if( sublist_pos )
              {
                // a sublist with a position goes into an extended node
                append_ex( token, &result, &last );
                POS( last ) = sublist_pos;
                sublist_pos = NULL;
              }
            else
              {
                append( token, &result, &last );
              }
            SET_FLAGS( last, flags );
          }
        flags = 0;
        if( last_token==TOKEN_ERROR ) break;
      }

    // test for unmatching square brackets
    // i.e. ...[... or ...]...
    if( level )
      {
        if( last_token==TOKEN_END )
          {
            errpos = origlen-bracketlen-1;
            last_token = TOKEN_ERROR;
          }
      }
    else
      {
        if( last_token==TOKEN_CLOSE )
          {
            errpos = origlen-len-1;
            last_token = TOKEN_ERROR;
          }
      }
    int pos_to = source-STRING(input);
    if( pos_to ) pos_to--;
    if( pos && method == TOKENIZE_DATA )
      {
        *pos = new_subword( input, STRING(input)+pos_from, pos_to-pos_from+1 );
      }
    return result;
  } // get_sublist()

  atom_t result = get_sublist( 0, method, NULL );

  // in case of error the collected tokens are replaced by an error atom
  if( last_token==TOKEN_ERROR )
    {
      DEUSE( result );
      result = new_parse_error( ERROR_INCOMPLETE_PAIR, errpos, input );
    }
  DEALLOC( buffer );
  return result;
}
| atom_t trim_shell_comment | ( | atom_t | word | ) |
| word | word containing source text |
Trims a shell comment from the beginning of the word. A shell comment can only be the first line of the text, and only if the first two characters of that line are #!. If a shell comment is trimmed, then the result is a subword which starts at the end of the first line (the newline character which ends the line is kept in the result); otherwise the input word is returned as is, but with an increased reference count.
{
#ifdef SAFEMODE
  assert( IS_WORD(word) || IS_SUBWORD(word) );
#endif

  chars_t text = STRING(word);
  int_t remaining = LENGTH(word);

  // A shell comment needs at least two characters and they must
  // be #! -- anything else is returned untouched (with a new reference)
  if( remaining<2 || text[0]!=TEXT('#') || text[1]!=TEXT('!') )
    return USE(word);

  // Skip the first line. The scan stops at the newline character,
  // so the newline itself stays in the result.
  for( ; remaining && *text!=TEXT('\n'); text++, remaining-- )
    {
    }

  // The result must always reference the main host word,
  // because the input could be a word or a subword.
  if( !IS_WORD(word) )
    return new_subword( WORD(word), text, remaining );
  return new_subword( word, text, remaining );
}
| word | word to be purified |
Purifies a word by processing all backslashes and bars. Returns a new word if needed. Assumes that the input needs purification.
{
  // Purifies WORD (a word or a subword): the backslashes and the bars
  // which act as escapes are removed, and the characters which are
  // escaped by them are stored in their enbarred form (see ENBAR).
  // Returns a new word if anything was removed; otherwise returns
  // WORD itself with an increased reference count.
#ifdef SAFEMODE
  assert( IS_WORD(word) || IS_SUBWORD(word) );
#endif
  chars_t source = STRING(word);
  int_t len = LENGTH(word);
  // Buffer for the longest word. One extra element is reserved for
  // the terminating NULL_CHAR: if the word contains no backslashes
  // and no bars, then all LEN characters are copied and the
  // terminator is stored right after them.
  chars_t buffer = alloca( CHAR_SIZE*(len+1) );
  chars_t bp = buffer;
  int need_enbar = 0;
  int is_mutated = 0;   // set to 1 if the word is mutated
  int in_backslash = 0; // the previous character was an escaping backslash
  int in_bars = 0;      // currently between two bars
  for( ; len; len--,source++ )
    {
      // must be calculated before the backslash state is cleared,
      // because it applies to the current character
      need_enbar = in_bars || in_backslash;
      if( in_backslash )
        {
          // backslashed character: it is copied, whatever it is
          in_backslash = 0;
        }
      else if( *source==TEXT('\\') )
        {
          // escaping backslash: it is dropped
          is_mutated = 1;
          in_backslash = 1;
          continue;
        }
      else if( *source==TEXT('|') )
        {
          // opening or closing bar: it is dropped
          is_mutated = 1;
          in_bars = !in_bars;
          continue;
        }
      if( need_enbar )
        *bp++ = ENBAR(*source);
      else
        *bp++ = *source;
    }
  *bp = NULL_CHAR;
  if( is_mutated )
    return new_word( buffer, bp-buffer );
  else
    return USE(word);
}
| atom_t build_syntax_tree | ( | atom_t | func | ) |
| func | var atom for the parse context |
Completely parses a function. Its source is stored in its body as a word, a subword, a data-tokenized list or a command-tokenized list. Building algorithm:
TO ... ENDs and create them as subfunctions{
  // Nothing to do if the function already has a syntax tree
  if( IS_NOT_EMPTY(TREE(func)) ) return empty_list;

  if( IS_EMPTY(BODY(func)) )
    {
      // Step 1. Tokenize the source, first as data and
      // then as commands. An error atom is returned as it is.
      atom_t data_tokens = tokenize( SOURCE(func), TOKENIZE_DATA );
      if( IS_ERROR(data_tokens) ) return data_tokens;

      atom_t command_tokens = tokenize( data_tokens, TOKENIZE_COMMANDS );
      DEUSE( data_tokens );
      if( IS_ERROR(command_tokens) ) return command_tokens;

      // Step 2. Extract TO..END's; the result becomes
      // the new body of the function
      atom_t new_body = preparse( command_tokens, func, LEVEL(func) );
      if( IS_ERROR(new_body) ) return new_body;
      DEUSE( BODY(func) );
      BODY(func) = new_body;
    }

  // Step 3. Parse the function body; the result becomes
  // the syntax tree of the function
  atom_t new_tree = parse( BODY(func), func, 1 );
  if( IS_ERROR(new_tree) ) return new_tree;
  DEUSE(TREE(func));
  TREE(func) = new_tree;

  // Because there might be some new TO..ENDs,
  // scan all locals and build the trees of those
  // for which DESCR2 is set. The first error stops the scan.
  atom_t rest = LOCALS(func);
  while( IS_NOT_EMPTY(rest) )
    {
      atom_t local = CAR(rest);
      if( DESCR2(local) )
        {
          atom_t status = build_syntax_tree( local );
          if( IS_ERROR(status) ) return status;
        }
      rest = CDR(rest);
    }

  return empty_list;
}