Lhogho  0.0.027
parser.h File Reference

Tokenization methods

#define TOKENIZE_DATA   0
 tokenize as data
#define TOKENIZE_COMMANDS   1
 tokenize as commands
#define TOKENIZE_READWORD   2
 tokenize as for READWORD function
#define TOKENIZE_READLIST   3
 tokenize as for READLIST function
#define ENBAR(ch)   (((unsigned char)ch)>127 ? (ch) : enbar[(unsigned char)ch])
#define DEBAR(ch)   ((ch)>31 ? (ch) : debar[(unsigned char)ch])
char_t enbar [128]
 table for a->|a| conversions
char_t debar [32]
 table for |a|->a conversions
void init_parser ()
 initializes parser
atom_t tokenize (atom_t input, int method)
 tokenizes into a list
atom_t trim_shell_comment (atom_t word)
 trims shell comment (if any)
atom_t purify (atom_t word)
 purifies a word
atom_t build_syntax_tree (atom_t function)
 parses body of user-defined function

Define Documentation

#define TOKENIZE_DATA   0
#define TOKENIZE_COMMANDS   1
#define TOKENIZE_READWORD   2
#define TOKENIZE_READLIST   3
#define ENBAR (   ch)    (((unsigned char)ch)>127 ? (ch) : enbar[(unsigned char)ch])
#define DEBAR (   ch)    ((ch)>31 ? (ch) : debar[(unsigned char)ch])

Function Documentation

Initializes tables enbar[] and debar[] which are used to enbar and debar a character.

{
  // Builds the two lookup tables used by ENBAR/DEBAR:
  //   debar[code]  - character that a code below 32 stands for
  //   enbar[ch]    - code below 32 that stands for character ch
  // Entries that are not listed in the map below keep their own value.

  // Position k of the map holds the character paired with code k;
  // an underscore marks a code that has no pair.
  // NOTE! if the string of enbarrable characters is
  // changed, also change dump_word()
  chars_t map = TEXT("______________()+-*/=<>|?_______");
  //locked->          x........xx..x..................
  int code;

  // start with both tables mapping every value to itself
  for( code=0; code<128; code++ )
  {
    enbar[code] = code;
    if( code<32 ) debar[code] = code;
  }

  // cross-link each listed character with its position in the map
  for( code=0; map[code]; code++ )
  {
    char_t plain = map[code];
    if( plain!=TEXT('_') )
    {
      debar[code] = plain;
      enbar[(unsigned char)plain] = code;
    }
  }
}
atom_t tokenize ( atom_t  input,
int  method 
)
Parameters:
input: word, subword or list to be tokenized
method: method of tokenization (one of the TOKENIZE_* constants)
Returns:
tokenized list

Tokenizes a word, a subword or a list into a list. If possible makes all words as subwords. Backslashes and bars in words are preserved (i.e. words are not purified). Comments and line continuations are ignored.

If the method is TOKENIZE_DATA then the input is tokenized as if it contains Logo data. If the method is TOKENIZE_COMMANDS then the input is tokenized as if it contains Logo commands. If the method is TOKENIZE_READWORD then the input is tokenized as expected by the READWORD function. If the method is TOKENIZE_READLIST then the input is tokenized as expected by the READLIST function.

If the input is a list then all its elements are tokenized one-by-one.

The return value is the tokenized list. If there is an error, the function returns an error atom whose error code is ERROR_INCOMPLETE_PAIR, whose error position is the position in the word (0-based), and whose error source is the word itself.

{
  // Overview:
  //  - a list whose flags show it is already tokenized for the requested
  //    method, and any float, is returned as is with one more reference;
  //  - any other list is tokenized element by element and the pieces are
  //    spliced into one flat result list;
  //  - a word or subword is scanned by the table-driven state machine in
  //    get_token(), and get_sublist() assembles the tokens (recursing on
  //    square brackets).
  // get_token(), is_number() and get_sublist() are nested functions
  // (a GNU C extension) that share the locals of this function.

  if( IS_LIST(input) )
  {
    // for TOKENIZE_DATA either tokenization flag is good enough
    if( method==TOKENIZE_DATA &&
        GET_FLAGS( input, FLAG_TOKENIZED_DATA|FLAG_TOKENIZED_COMMANDS ) )
      return USE(input);
    if( method==TOKENIZE_COMMANDS &&
        GET_FLAGS( input, FLAG_TOKENIZED_COMMANDS ) )
      return USE(input);
  }

  if( IS_FLOAT(input) )
  {
    return USE(input);
  }

  #ifdef SAFEMODE
  assert( IS_WORD(input) || IS_SUBWORD(input) || IS_LIST(input) );
  #endif

  // First check whether the input is a list.
  // If it is then tokenize recursively all its elements.
  if( IS_LIST(input) )
  {
    atom_t result = empty_list;
    atom_t last   = empty_list;
    atom_t x;
    atom_t y;
    for( x=input; IS_NOT_EMPTY(x); x=CDR(x) )
    {
      // nested lists are always tokenized as data
      int submethod = IS_LIST(CAR(x))?TOKENIZE_DATA:method;
      atom_t element = tokenize( CAR(x), submethod );

      if( IS_ERROR(element) )
      {
        // drop what was built so far and hand the error to the caller
        DEUSE( result );
        result = element;
        break;
      }

      // flags of the original node: the first kind goes to the first
      // node produced from it, the second kind goes to the last one
      int initial_flags = GET_FLAGS( x, FLAG_NEWLINE|FLAG_AFTER_SPACE );
      int final_flags   = GET_FLAGS( x, FLAG_BEFORE_SPACE );

      if( IS_FLOAT(element) )
      {
        append( USE(element), &result, &last );
        SET_FLAGS( last, initial_flags|final_flags );
      }
      else if( IS_LIST(CAR(x)) )
      {
        // a nested list stays a single element of the result
        if( IS_EXTENDED(x) )
        {
          append_ex( USE(element), &result, &last );
          DEUSE( POS(last) );
          POS( last ) = USE( POS(x) );
        }
        else
          append( USE(element), &result, &last );
        SET_FLAGS( last, initial_flags|final_flags );
      }
      else
      {
        // a word may produce several tokens: splice all of them in
        for( y=element; IS_NOT_EMPTY(y); y=CDR(y) )
        {
          if( IS_EXTENDED(y) )
          {
            append_ex( USE(CAR(y)), &result, &last );
            DEUSE( POS(last) );
            POS( last ) = USE( POS(y) );
          }
          else
            append( USE(CAR(y)), &result, &last );
          SET_FLAGS( last, FLAGS(y) );
          if( y==element ) SET_FLAGS( last, initial_flags );
        }
        // NOTE(review): this guard is spelled SAFE_MODE while the other
        // guards in this file use SAFEMODE, so presumably this assert is
        // never compiled in - confirm which spelling is intended.
        #ifdef SAFE_MODE
        assert( IS_NOT_EMPTY(last) );
        #endif
        SET_FLAGS( last, final_flags);
      }

      DEUSE( element );
    }

    return result;
  }

  // The input is a word or a subword
  chars_t source  = STRING(input);   // next character to be read
  int     len     = LENGTH(input);   // characters still unread
  int     origlen = len;

  // Scratch buffer for the characters of the current token. A token can
  // collect as many as len characters and a terminating NULL_CHAR is
  // written after them (input "x~a" collects 'x', '~' and 'a'), hence
  // the extra character.
  chars_t buffer = ALLOC( CHAR_SIZE*(len+1) );
  char_t  ch;

  int_t   errpos    = -1;   // 0-based position reported with an error

  int last_token = TOKEN_LINEEND;

  // Gets the next token. Return:
  //  TOKEN_END    if there are no more tokens
  //  TOKEN_WORD   if the token is a word
  //  TOKEN_DIRTY_WORD if the token is a word with \ or |
  //  TOKEN_OPEN   if the token is [
  //  TOKEN_CLOSE  if the token is ]
  //  TOKEN_LINEEND   if the token is <nl>
  //  TOKEN_SPACE   if at least one whitespace is met
  //  TOKEN_ERROR  if the input ends inside an unfinished construct
  //
  // For word tokens *token receives the new atom: a subword of the input
  // when the token text is a contiguous part of the source, or a new word
  // built from the scratch buffer when it is not (mutated).
  // The method parameter selects which characters are special.
  int get_token( atom_t *token, int method )
  {
    // Return 1 if the text collected in the buffer so far (up to, but not
    // including, bp) looks like the beginning of a number in exponential
    // notation, so that a following + or - belongs to the number.
    // The text is scanned backwards, from bp towards the buffer start.
    // NOTE(review): the code accepts less than the pattern noted below:
    // stage 2 takes exactly one digit and then requires a dot, so "12e"
    // and "1.25e" are rejected, while running out of characters in any
    // stage is accepted ("e" and "1e" both return 1) - confirm whether
    // this is intended.
    int is_number(chars_t bp)
    { // "E" {digit}* "." {digit}+
      //  1     2      3     4
      chars_t cp = bp;
      int num_mode = 1;
      char_t ch;

      cp = bp;

      if( bp==buffer ) return 0;

      while( cp>buffer )
      {
        cp--;
        ch = *cp;
        switch( num_mode )
        {
        case 1:
          if( ch!=TEXT('E') && ch!=TEXT('e') ) return 0;
          num_mode = 2;
          break;
        case 2: ;
          if( ch<TEXT('0') || ch>TEXT('9') ) return 0;
          num_mode = 3;
          break;
        case 3: ;
          if( ch!=TEXT('.') ) return 0;
          num_mode = 4;
          break;
        case 4: ;
          if( ch<TEXT('0') || ch>TEXT('9') ) return 0;
          break;
        }
      }
      return 1;
    }

    if( !len ) return TOKEN_END;

    int dirty = 0;     // the token contains \ or | and must be purified
    int mutated = 0;   // the token text differs from the source text
    chars_t bp = buffer; *bp=NULL_CHAR;
    chars_t sp = source;   // where the current token starts in the source

    int mode = MODE_ENTRY; // current mode
    int code;        // action code


    // Action tables. There is one table per class of input character and
    // each table is indexed by the current mode. An entry is a sum of
    // PAT_* actions which are executed in the fixed order used by the
    // code after the "again" label.

    int stack[MAX_MODE];   // return-to-mode for each mode
    static int mode_eof[MAX_MODE] =
    {
      /* entry */ PAT_TOKEN_END,
      /* whitespace  */ PAT_TOKEN_SPACE,
      /* word     */ PAT_TOKEN_WORD,
      /* barred   */ PAT_ERROR,
      /* backslashed */ PAT_ERROR,
      /* tilde */ PAT_ERROR,
      /* semitilde   */ PAT_ERROR,
      /* semicolon   */ PAT_RETURN,
      /* tildespace  */ PAT_ERROR,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_eol[MAX_MODE] =
    {
      /* entry */ PAT_NEXT+PAT_TOKEN_LINE,
      /* whitespace  */ PAT_TOKEN_SPACE,
      /* word     */ PAT_TOKEN_WORD,
      /* barred   */ PAT_PUSH+PAT_NEXT,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_NEXT+PAT_RETURN,
      /* semitilde   */ PAT_NEXT+PAT_RETURN,
      /* semicolon   */ PAT_RETURN,
      /* tildespace  */ PAT_NEXT+PAT_RETURN,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_space[MAX_MODE] =
    {
      /* entry */ PAT_NEXT+PAT_GOTO_SPACE,
      /* whitespace  */ PAT_NEXT,
      /* word     */ PAT_TOKEN_WORD,
      /* barred   */ PAT_PUSH+PAT_NEXT,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_NEXT+PAT_GOTO_TILDESPACE,
      /* semitilde   */ PAT_NEXT,
      /* semicolon   */ PAT_NEXT,
      /* tildespace  */ PAT_NEXT,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_open[MAX_MODE] =
    {
      /* entry */ PAT_NEXT+PAT_TOKEN_OPEN,
      /* whitespace  */ PAT_TOKEN_SPACE,
      /* word     */ PAT_TOKEN_WORD,
      /* barred   */ PAT_PUSH+PAT_NEXT,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_TILDE+PAT_RETURN,
      /* semitilde   */ PAT_RETURN,
      /* semicolon   */ PAT_NEXT,
      /* tildespace  */ PAT_TILDE+PAT_TOKEN_WORD,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_close[MAX_MODE] =
    {
      /* entry */ PAT_NEXT+PAT_TOKEN_CLOSE,
      /* whitespace  */ PAT_TOKEN_SPACE,
      /* word     */ PAT_TOKEN_WORD,
      /* barred   */ PAT_PUSH+PAT_NEXT,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_TILDE+PAT_RETURN,
      /* semitilde   */ PAT_RETURN,
      /* semicolon   */ PAT_NEXT,
      /* tildespace  */ PAT_TILDE+PAT_TOKEN_WORD,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_bar[MAX_MODE] =
    {
      /* entry */ PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_RETURN_TO_WORD+PAT_GOTO_BARRED+PAT_ERROR_POS,
      /* whitespace  */ PAT_TOKEN_SPACE,
      /* word     */ PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_GOTO_BARRED+PAT_RETURN_TO_SELF+PAT_ERROR_POS,
      /* barred   */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_TILDE+PAT_RETURN,
      /* semitilde   */ PAT_RETURN,
      /* semicolon */ PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_GOTO_BARRED+PAT_RETURN_TO_SELF+PAT_ERROR_POS,
      /* tildespace  */ PAT_TILDE+PAT_TOKEN_WORD,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_backslash[MAX_MODE] =
    {
      /* entry */ PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_RETURN_TO_WORD+PAT_GOTO_BACKSLASHED+PAT_ERROR_POS,
      /* whitespace  */ PAT_TOKEN_SPACE,
      /* word     */ PAT_PUSH+PAT_NEXT+PAT_DIRTY+PAT_RETURN_TO_SELF+PAT_GOTO_BACKSLASHED+PAT_ERROR_POS,
      /* barred   */ PAT_PUSH+PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_BACKSLASHED,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_TILDE+PAT_RETURN,
      /* semitilde   */ PAT_RETURN,
      /* semicolon   */ PAT_NEXT2, //+PAT_RETURN_TO_SELF+PAT_GOTO_BACKSLASHED,
      /* tildespace  */ PAT_TILDE+PAT_TOKEN_WORD,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_tilde[MAX_MODE] =
    {
      /* entry */ PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_TILDE+PAT_ERROR_POS,
      /* whitespace  */ PAT_TOKEN_SPACE,
      /* word     */ PAT_NEXT+PAT_MUTATED+PAT_RETURN_TO_SELF+PAT_GOTO_TILDE+PAT_ERROR_POS,
      /* barred   */ PAT_PUSH+PAT_NEXT,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_TILDE+PAT_RETURN,
      /* semitilde   */ PAT_RETURN,
      /* semicolon   */ PAT_NEXT+PAT_MUTATED+PAT_RETURN_TO_CALLER+PAT_GOTO_SEMITILDE+PAT_ERROR_POS,
      /* tildespace  */ PAT_TILDE+PAT_TOKEN_WORD,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_semicolon[MAX_MODE] =
    {
      /* entry */ PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_SEMICOLON,
      /* whitespace  */ PAT_TOKEN_SPACE,
      /* word     */ PAT_NEXT+PAT_MUTATED+PAT_RETURN_TO_SELF+PAT_GOTO_SEMICOLON,
      /* barred   */ PAT_PUSH+PAT_NEXT,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_TILDE+PAT_RETURN,
      /* semitilde   */ PAT_RETURN,
      /* semicolon   */ PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_SEMITILDE,
      /* tildespace  */ PAT_TILDE+PAT_TOKEN_WORD,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_else[MAX_MODE] =
    {
      /* entry */ PAT_PUSH+PAT_NEXT+PAT_GOTO_WORD,
      /* whitespace  */ PAT_TOKEN_SPACE,
      /* word     */ PAT_PUSH+PAT_NEXT,
      /* barred   */ PAT_PUSH+PAT_NEXT,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_TILDE+PAT_RETURN,
      /* semitilde   */ PAT_RETURN,
      /* semicolon   */ PAT_NEXT,
      /* tildespace  */ PAT_TILDE+PAT_TOKEN_WORD,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_parens[MAX_MODE] =
    {
      /* entry */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
      /* whitespace  */ PAT_TOKEN_SPACE,
      /* word     */ PAT_TOKEN_WORD,
      /* barred   */ PAT_PUSH+PAT_NEXT,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_TILDE+PAT_RETURN,
      /* semitilde   */ PAT_RETURN,
      /* semicolon   */ PAT_NEXT,
      /* tildespace  */ PAT_TILDE+PAT_TOKEN_WORD,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_equal[MAX_MODE] =
    {
      /* entry */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
      /* whitespace  */ PAT_TOKEN_SPACE,
      /* word     */ PAT_TOKEN_WORD,
      /* barred   */ PAT_PUSH+PAT_NEXT,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_TILDE+PAT_RETURN,
      /* semitilde   */ PAT_RETURN,
      /* semicolon   */ PAT_NEXT,
      /* tildespace  */ PAT_TILDE+PAT_TOKEN_WORD,
      /* less     */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
      /* greater  */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD
    };
    static int mode_less[MAX_MODE] =
    {
      /* entry */ PAT_PUSH+PAT_NEXT+PAT_GOTO_LESS,
      /* whitespace  */ PAT_PUSH+PAT_NEXT+PAT_GOTO_LESS, //PAT_TOKEN_WORD, @boza
      /* word     */ PAT_TOKEN_WORD,
      /* barred   */ PAT_PUSH+PAT_NEXT,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_TILDE+PAT_RETURN,
      /* semitilde   */ PAT_RETURN,
      /* semicolon   */ PAT_NEXT,
      /* tildespace  */ PAT_TILDE+PAT_TOKEN_WORD,
      /* less     */ PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };
    static int mode_greater[MAX_MODE] =
    {
      /* entry */ PAT_PUSH+PAT_NEXT+PAT_GOTO_GREATER,
      /* whitespace  */ PAT_PUSH+PAT_NEXT+PAT_GOTO_GREATER, //PAT_TOKEN_WORD, @boza
      /* word     */ PAT_TOKEN_WORD,
      /* barred   */ PAT_PUSH+PAT_NEXT,
      /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
      /* tilde */ PAT_TILDE+PAT_RETURN,
      /* semitilde   */ PAT_RETURN,
      /* semicolon   */ PAT_NEXT,
      /* tildespace  */ PAT_TILDE+PAT_TOKEN_WORD,
      /* less     */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
      /* greater  */ PAT_TOKEN_WORD
    };


again:
    // get action code
    GET_CHAR;


#ifdef DEBUG_TOKENS
    printf("<TOKENS> length=%d\n",len);
    if(ch<TEXT(' '))
    printf("<TOKENS> get(#%d)\t",ch);
    else
    printf("<TOKENS> get('%c')\t",ch);
    switch( mode )
    {
    case MODE_ENTRY:    printf("     ENTRY -> "); break;
    case MODE_SPACE:    printf("     SPACE -> "); break;
    case MODE_WORD:     printf("      WORD -> "); break;
    case MODE_BARRED:      printf("    BARRED -> "); break;
    case MODE_BACKSLASHED: printf(" BACKSLASH -> "); break;
    case MODE_TILDE:    printf("     TILDE -> "); break;
    case MODE_SEMITILDE:   printf(" SEMITILDE -> "); break;
    case MODE_SEMICOLON:   printf(" SEMICOLON -> "); break;
    case MODE_TILDESPACE:  printf("TILDESPACE -> "); break;
    }
#endif
    // Pick the table for the class of the current character; characters
    // that are not special for the method use mode_else.
    code = mode_else[mode];
    if( !len )
      code = mode_eof[mode];
    else
    {
      if( method==TOKENIZE_READWORD )
      { // tokenize as expected by READWORD
        if( ch==TEXT('|')  ) code = mode_bar[mode];
        else if( ch==TEXT('\\') ) code = mode_backslash[mode];
      }
      else
      if( method==TOKENIZE_READLIST )
      { // tokenize as expected by READLIST: like data, but a semicolon
        // is an ordinary character
        if(      ch==TEXT('\n') ) code = mode_eol[mode];
        else if( ch==TEXT('\r') ) code = mode_eol[mode];
        else if( ch<=TEXT(' ') ) code = mode_space[mode];
        else if( ch==TEXT('[')  ) code = mode_open[mode];
        else if( ch==TEXT(']')  ) code = mode_close[mode];
        else if( ch==TEXT('|')  ) code = mode_bar[mode];
        else if( ch==TEXT('\\') ) code = mode_backslash[mode];
        else if( ch==TEXT('~')  ) code = mode_tilde[mode];
      }
      else
      if( method==TOKENIZE_DATA )
      { // tokenize as data
        if(      ch==TEXT('\n') ) code = mode_eol[mode];
        else if( ch==TEXT('\r') ) code = mode_eol[mode];
        else if( ch<=TEXT(' ') ) code = mode_space[mode];
        else if( ch==TEXT('[')  ) code = mode_open[mode];
        else if( ch==TEXT(']')  ) code = mode_close[mode];
        else if( ch==TEXT('|')  ) code = mode_bar[mode];
        else if( ch==TEXT('\\') ) code = mode_backslash[mode];
        else if( ch==TEXT('~')  ) code = mode_tilde[mode];
        else if( ch==TEXT(';')  ) code = mode_semicolon[mode];
      }
      else
      { // tokenize as commands
        if( ch==TEXT('(') ||
            ch==TEXT(')') ) code = mode_parens[mode];
        else if( *buffer!=TEXT('"') )
        {
          // operators are not split off inside a token that starts with
          // a double quote, nor a sign that follows a number's exponent
          if( (ch==TEXT('+') || ch==TEXT('-')) && is_number(bp) ) { }
          else if( ch==TEXT('+') ||
              ch==TEXT('-') ||
              ch==TEXT('*') ||
              ch==TEXT('/'))  code = mode_parens[mode];
          else if( ch==TEXT('=') ) code = mode_equal[mode];
          else if( ch==TEXT('<') ) code = mode_less[mode];
          else if( ch==TEXT('>') ) code = mode_greater[mode];
        }
      }
    }
    // process action code
    int _stack = -1;

    int newmode = 0;
    if( code & PAT_GOTO )
    {
      // this is pre-processing of GOTO
      // if old mode was space, and new is not,
      // then update initial position of next token
      newmode = (code>>PAT_SHIFT) & 0xF;
      if( mode==MODE_SPACE && newmode!=MODE_SPACE ) { sp = source; }
    }


    if( code & PAT_TILDE )
    {
      *bp++ = TEXT('~');
      #ifdef DEBUG_TOKENS
      printf("\n<TOKENS> put('%c'/%d)\n",TEXT('~'),TEXT('~'));
      #endif //DEBUG_TOKENS
    }
    if( code & PAT_PUSH )
    {
      // push a character only if:
      // - currently not in bars |..?..|
      // - currently in bars, but not in semicolon ;..|..?..|
      if( mode!=MODE_BARRED ||
          (stack[mode]!=MODE_SEMITILDE &&
            stack[mode]!=MODE_SEMICOLON) )
      {
        *bp++ = ch;
        #ifdef DEBUG_TOKENS
        printf("\n<TOKENS> put('%c'/%d)\n",ch,ch);
        #endif //DEBUG_TOKENS
      }
    }
    if( code & PAT_NEXT2 )
    {
      source++;
      len--;
    }
    // The len>0 test keeps the second step of PAT_NEXT2 from running past
    // the end of the input when only one character was left.
    if( (code & (PAT_NEXT|PAT_NEXT2)) && len>0 )
    {
      // CR immediately followed by LF is consumed as a single character;
      // look at the LF only if it still belongs to the input
      if( *source=='\r' && len>1 && *(source+1)=='\n' )
      {
        source++;
        len--;
      }
      source++;
      len--;
    }
    if( code & PAT_DIRTY ) dirty = 1;
    if( code & PAT_MUTATED )  mutated = 1;
    if( code & PAT_ERROR_POS )   errpos = origlen-len-1;
    if( code & PAT_RETURN_TO_WORD ) _stack = MODE_WORD;
    if( code & PAT_RETURN_TO_SELF ) _stack = mode;
    if( code & PAT_RETURN_TO_CALLER ) _stack = stack[mode];
    if( code & PAT_GOTO )
    {
      // this is post-processing of GOTO
      mode = newmode;
      stack[mode] = _stack;
    }
    if( code & PAT_RETURN )   mode = stack[mode];
    if( code & PAT_TOKEN )
    {
      int _token = (code>>PAT_SHIFT) & 0xF;
      if( _token!=TOKEN_WORD ) return _token;
      if( mutated )
      {
        // the token text is not contiguous in the source,
        // so build a new word from the collected characters
        *bp = NULL_CHAR;
        *token  = new_word( buffer, UNKNOWN );
        #ifdef DEBUG_TOKENS
        printf("MUTATED TOKEN "); dumpln(*token);
        printf("\n\n");
        #endif
      }
      else
      {
        // the token is exactly the source text between sp and source
        *token = new_subword( input, sp, source-sp );
        #ifdef DEBUG_TOKENS
        printf("NORMAL TOKEN **"); dump(*token);
        printf("** (len=%d)\n\n\n",source-sp);
        #endif
      }
      return dirty?TOKEN_DIRTY_WORD:TOKEN_WORD;
    }
    if( code & PAT_ERROR )
    {
      #ifdef DEBUG_TOKENS
      printf("ERROR\n");
      #endif
      return TOKEN_ERROR;
    }

#ifdef DEBUG_TOKENS
    switch( mode )
    {
    case MODE_ENTRY:    printf("ENTRY\n"); break;
    case MODE_SPACE:    printf("SPACE\n"); break;
    case MODE_WORD:     printf("WORD\n"); break;
    case MODE_BARRED:      printf("BARRED\n"); break;
    case MODE_BACKSLASHED: printf("BACKSLASH\n"); break;
    case MODE_TILDE:    printf("TILDE\n"); break;
    case MODE_SEMITILDE:   printf("SEMITILDE\n"); break;
    case MODE_SEMICOLON:   printf("SEMICOLON\n"); break;
    case MODE_TILDESPACE:  printf("TILDESPACE\n"); break;
    }
#endif
    goto again;
  } // get_token()


  // Collects tokens into a list until the input ends or, for a nested
  // list, until the matching ] is met.
  //   level       nesting depth, 0 for the outermost call
  //   full_parse  tokenization method passed on to get_token()
  //   pos         if not NULL, receives NULL or (for TOKENIZE_DATA) a
  //               subword covering the source text of the sublist
  // Errors are reported through last_token==TOKEN_ERROR and errpos; the
  // list built so far is still returned and is released by the caller.
  atom_t get_sublist( int level, int full_parse, atom_t* pos ) //ex2//
  {
    atom_t result  = empty_list;
    atom_t last    = empty_list;
    if( pos ) *pos = NULL; //ex2//

    atom_t token   = NULL;
    atom_t sublist_pos = NULL; //ex2//
    int    flags;
    int bracketlen = len;  // unread length when this (sub)list started,
                           // used to locate an unmatched [

    int pos_from = source-STRING(input); //ex2//
    if( pos_from ) pos_from--; //ex2//

    flags = 0;
    while( (last_token=get_token(&token,full_parse)) )
    {
      sublist_pos = NULL;

      #ifdef DEBUG_TOKENIZATION
      switch(last_token)
      {
      case TOKEN_END:    printf("TOKEN_END\n"); break;
      case TOKEN_SPACE:  printf("TOKEN_SPACE\n"); break;
      case TOKEN_WORD:   printf("TOKEN_WORD    @"); dump(token); printf("@\n"); break;
      case TOKEN_DIRTY_WORD: printf("TOKEN_|WORD|  @"); dump(token); printf("@\n"); break;
      case TOKEN_OPEN:   printf("TOKEN_OPEN    [\n"); break;
      case TOKEN_CLOSE:  printf("TOKEN_CLOSE   ]\n"); break;
      case TOKEN_LINEEND:   printf("TOKEN_LINEEND\n"); break;
      case TOKEN_ERROR:  printf("TOKEN_ERROR\n"); break;
      }
      #endif

      if( last_token==TOKEN_ERROR ) return result;
      if( last_token==TOKEN_SPACE )
      {
        // remember the space on both of its neighbours
        if( IS_NOT_EMPTY(last) ) SET_FLAGS( last, FLAG_BEFORE_SPACE );
        flags |= FLAG_AFTER_SPACE;
        continue;
      }
      if( last_token==TOKEN_DIRTY_WORD )
      {
        // resolve backslashes and bars
        atom_t x = purify( token );
        DEUSE( token );
        token = x;
      }
      if( last_token==TOKEN_CLOSE ) break;
      if( last_token==TOKEN_LINEEND )
      {
        flags |= FLAG_NEWLINE;
        continue;
      }
      if( last_token==TOKEN_OPEN)
      {
        // the contents of [...] are always tokenized as data
        token = get_sublist( level+1, TOKENIZE_DATA, &sublist_pos ); // recursive //ex2//

        if( last_token==TOKEN_ERROR )
        {
          DEUSE( token );
          if( sublist_pos ) DEUSE(sublist_pos);
          sublist_pos = NULL;
          break;
        }
      }

      if( method==TOKENIZE_COMMANDS )
        flags |= FLAG_TOKENIZED_COMMANDS;

      if( method==TOKENIZE_DATA )
        flags |= FLAG_TOKENIZED_DATA;

      if( method==TOKENIZE_COMMANDS &&
          last_token==TOKEN_WORD &&
          LENGTH(token)>1 &&
          *STRING(token)==TEXT('?') &&
          *(STRING(token)+1)>=TEXT('0') &&
          *(STRING(token)+1)<=TEXT('9') )
      {
        // process template ?nn->(? nn) for command tokenization
        atom_t new_qoken = new_subword( token, STRING(token), 1 );
        atom_t new_token = new_subword( token, STRING(token)+1, LENGTH(token)-1 );
        DEUSE( token );

        append( new_word(TEXT("("),-1), &result, &last );   // (
        append( new_qoken, &result, &last );    // ?
        append( new_token, &result, &last );    // nn
        append( new_word(TEXT(")"),-1), &result, &last );   // )
      }
      else
      {
        // normal token, no more processing needed
        if( sublist_pos ) //ex//
        {
          // a sublist also records the source text it came from
          append_ex( token, &result, &last );
          POS( last ) = sublist_pos;

          sublist_pos = NULL;
        }
        else
        {
          append( token, &result, &last );
        }
        SET_FLAGS( last, flags );
      }

      flags = 0;
      if( last_token==TOKEN_ERROR ) break;
    }

    // test for unmatching square brackets
    // i.e. ...[... or ...]...
    if( level )
    {
      if( last_token==TOKEN_END )
      {
        // the input ended inside [...]
        errpos = origlen-bracketlen-1;
        last_token = TOKEN_ERROR;
      }
    }
    else
    {
      if( last_token==TOKEN_CLOSE )
      {
        // a ] without a matching [
        errpos = origlen-len-1;
        last_token = TOKEN_ERROR;
      }
    }

    int pos_to = source-STRING(input);
    if( pos_to ) pos_to--;

    if( pos && method == TOKENIZE_DATA ) //ex2//
    {
      *pos = new_subword( input, STRING(input)+pos_from, pos_to-pos_from+1 );
    }

    return result;
  } // get_sublist()

  atom_t result = get_sublist( 0, method, NULL ); //ex//

  // in case of error replace the partial result with an error atom
  if( last_token==TOKEN_ERROR )
  {
    DEUSE( result );
    result = new_parse_error( ERROR_INCOMPLETE_PAIR, errpos, input );
  }


  DEALLOC( buffer );

  return result;
}
Parameters:
word: word containing source text
Returns:
atom with the source text with trimmed shell comment

Trims a shell comment from the beginning of the word. Only the first line can be a shell comment, and only if its first two characters are #!. If a shell comment is trimmed, the result is a subword that starts at the newline character ending the first line (or an empty subword if the word contains no newline); otherwise the input word is returned as is, but with an increased reference count.

{
  #ifdef SAFEMODE
  assert( IS_WORD(word) || IS_SUBWORD(word) );
  #endif

  chars_t cursor    = STRING(word);
  int_t   remaining = LENGTH(word);

  // A shell comment needs at least the two characters #! at the very
  // beginning; anything else is handed back untouched.
  int has_shebang = remaining>=2
                 && cursor[0]==TEXT('#')
                 && cursor[1]==TEXT('!');
  if( !has_shebang ) return USE(word);

  // Move forward to the newline that ends the first line
  // (or to the end of the text if there is no newline).
  for( ; remaining && *cursor!=TEXT('\n'); cursor++ ) remaining--;

  // The result must reference the main host word, and the
  // input may be either that word or a subword of it.
  atom_t host = IS_WORD(word) ? word : WORD(word);
  return new_subword( host, cursor, remaining );
}
atom_t purify ( atom_t  word)
Parameters:
word: word to be purified
Returns:
purified word

Purifies a word by processing all backslashes and bars. Returns a new word if needed. Assumes that the input needs purification.

{
  // Copies the word into a temporary buffer while dropping the backslash
  // and bar characters themselves. Characters that were protected by a
  // backslash or enclosed in bars are stored through ENBAR(); all other
  // characters are copied unchanged.
  // Returns a new word if at least one backslash or bar was dropped,
  // otherwise the input word itself with one more reference.
  #ifdef SAFEMODE
  assert( IS_WORD(word) || IS_SUBWORD(word) );
  #endif

  chars_t source = STRING(word);
  int_t   len    = LENGTH(word);

  // If nothing is dropped all len characters are copied, and the
  // terminating NULL_CHAR written after the loop needs one more slot.
  chars_t buffer = alloca( CHAR_SIZE*(len+1) );
  chars_t bp = buffer;

  int need_enbar    = 0;
  int is_mutated    = 0; // set to 1 if the word is mutated
  int in_backslash  = 0; // the previous character was an active backslash
  int in_bars       = 0; // currently between an opening and a closing bar
  for( ; len; len--,source++ )
  {
    // decided before the state is updated, so it describes the
    // protection in force for the current character
    need_enbar = in_bars || in_backslash;
    if( in_backslash )
    {
      // a backslashed character is always taken literally,
      // even if it is another backslash or a bar
      in_backslash = 0;
    }
    else if( *source==TEXT('\\') )
    {
      is_mutated = 1;
      in_backslash = 1;
      continue;
    }
    else if( *source==TEXT('|') )
    {
      is_mutated = 1;
      in_bars = !in_bars;
      continue;
    }
    if( need_enbar )
      *bp++ = ENBAR(*source);
    else
      *bp++ = *source;
  }

  *bp = NULL_CHAR;

  if( is_mutated )
    return new_word( buffer, bp-buffer );
  else
    return USE(word);
}
Parameters:
func: var atom for the parse context
Returns:
empty_list or an error atom

Completely parses a function. Its source is stored in its body as a word, a subword, a data-tokenized list or a command-tokenized list. Building algorithm:

  • tokenization of body as commands
  • extracting all TO ... ENDs and create them as subfunctions
  • parsing the func's body into abstract syntax tree
  • recursively build trees of subfunctions
{
  // Nothing to do for a function that already has a syntax tree
  if( IS_NOT_EMPTY(TREE(func)) ) return empty_list;

  // A function without a body gets one built from its source text
  if( IS_EMPTY(BODY(func)) )
  {
    // Step 1. Tokenize the source, first as data and then as commands
    atom_t as_data = tokenize( SOURCE(func), TOKENIZE_DATA );
    if( IS_ERROR(as_data) ) return as_data;

    atom_t as_commands = tokenize( as_data, TOKENIZE_COMMANDS );
    DEUSE( as_data );
    if( IS_ERROR(as_commands) ) return as_commands;

    // Step 2. Extract TO..END's
    atom_t new_body = preparse( as_commands, func, LEVEL(func) );
    if( IS_ERROR(new_body) ) return new_body;
    DEUSE( BODY(func) );
    BODY(func) = new_body;
  }

  // Step 3. Parse the body and store the result as the function's tree
  atom_t new_tree = parse( BODY(func), func, 1 );
  if( IS_ERROR(new_tree) ) return new_tree;
  DEUSE( TREE(func) );
  TREE(func) = new_tree;

  // Step 4. There might be some new TO..ENDs among the locals, so walk
  // through all of them and build each local that has DESCR2 set. The
  // first error stops the walk and is returned to the caller.
  atom_t rest = LOCALS(func);
  while( IS_NOT_EMPTY(rest) )
  {
    atom_t candidate = CAR(rest);
    if( DESCR2(candidate) )
    {
      atom_t outcome = build_syntax_tree( candidate );
      if( IS_ERROR(outcome) ) return outcome;
    }
    rest = CDR(rest);
  }

  return empty_list;
}

Variable Documentation

char_t enbar[128]

[ HOME | INDEX | ATOMS | VARS | REFERENCE ]
Lhogho Developer's Documentation
Tue Feb 7 2012