Lhogho  0.0.027
unicode.h File Reference

Converters

Used to convert one string to another. In ASCII mode all conversions are like identities - i.e. no conversion is actually done.

#define TEXT(a)   L##a
 fix string constants
#define FILENAME(x)   UTF16_to_ASCII(x)
 fix file names
#define UNFILENAME(x)   ASCII_to_UTF16(x)
 unfix file names

Format strings

#define NULL_CHAR   TEXT('\0')
#define STR   "%S"
#define CHR   "%C"

String functions

They are used to maintain dual UNICODE/ASCII processing. This is needed because there are different names for multibyte and widechar functions.

#define NO_MORE   WEOF
#define PUTCHAR(x, y)   putwc(x,y)
#define GETCHAR(x)   getwc(x)
#define STRLEN(x)   wcslen(x)
#define STRNCPY(x, y, z)   wcsncpy(x,y,z)
#define STRCMP(x, y)   wcscmp(x,y)
#define STRTOD(x, y)   wcstod(x,y)
#define STRTOL(x, y)   wcstoll(x,y,0)
#define TOUPPER(x)   towupper(x)
#define TOLOWER(x)   towlower(x)
#define ISDIGIT(x)   iswdigit(x)
#define STRCHR(x, y)   wcschr(x,y)
#define PRINT(x,...)   printf(x,__VA_ARGS__)
#define STRFTIME(x, y, z, t)   wcsftime(x,y,z,t)
#define SPRINTF(x, n, y, z)   swprintf(x,n,y,z)
#define SPRINT(x, n, y,...)   swprintf(x,n,y,__VA_ARGS__)

Functions

chars_t UTF8_to_UTF16 (byte_t *source)
 converts UTF-8 to UTF-16
byte_t * UTF16_to_UTF8 (chars_t source)
char * UTF16_to_ASCII (chars_t ws)
 converts UTF-16 to ASCII
chars_t ASCII_to_UTF16 (const char *s)
 converts ASCII to UTF-16
chars_t ASCII_to_ASCII (const char *s)
 converts ASCII to ASCII
void * load_file (chars_t wfilename, int *filesize)
 load text file into a word atom

Define Documentation

#define TEXT (   a)    L##a
#define FILENAME (   x)    UTF16_to_ASCII(x)
#define UNFILENAME (   x)    ASCII_to_UTF16(x)
#define NULL_CHAR   TEXT('\0')
#define STR   "%S"
#define CHR   "%C"
#define NO_MORE   WEOF
#define PUTCHAR (   x,
  y 
)    putwc(x,y)
#define GETCHAR (   x)    getwc(x)
#define STRLEN (   x)    wcslen(x)
#define STRNCPY (   x,
  y,
  z 
)    wcsncpy(x,y,z)
#define STRCMP (   x,
  y 
)    wcscmp(x,y)
#define STRTOD (   x,
  y 
)    wcstod(x,y)
#define STRTOL (   x,
  y 
)    wcstoll(x,y,0)
#define TOUPPER (   x)    towupper(x)
#define TOLOWER (   x)    towlower(x)
#define ISDIGIT (   x)    iswdigit(x)
#define STRCHR (   x,
  y 
)    wcschr(x,y)
#define PRINT (   x,
  ... 
)    printf(x,__VA_ARGS__)
#define STRFTIME (   x,
  y,
  z,
  t 
)    wcsftime(x,y,z,t)
#define SPRINTF (   x,
  n,
  y,
  z 
)    swprintf(x,n,y,z)
#define SPRINT (   x,
  n,
  y,
  ... 
)    swprintf(x,n,y,__VA_ARGS__)

Function Documentation

chars_t UTF8_to_UTF16 ( byte_t *  source)
Parameters:
source: characters to convert
Returns:
converted string
Note:
defined only if UNICODE_CHARS symbol is defined

Converts string of multibyte UTF-8 encoding to widechar UTF-16LE encoding.

{
  // Decodes the NUL-terminated UTF-8 string `source` into a newly
  // ALLOC'ed, NUL-terminated string of UTF-16 code units:
  //   - 1-, 2- and 3-byte sequences produce one unit each;
  //   - 4-byte sequences (U+10000 and above) produce a surrogate pair.
  // Continuation bytes are not validated, but a sequence cut short by
  // the end of the string is never read past the terminating NUL.
  int len = strlen ((char*)source);

  // A 4-byte sequence yields only two units, so one unit per input
  // byte (plus the terminator) is always enough room.
  wchar_t *buffer = alloca( CHAR_SIZE*(len+1) );
  wchar_t *buf = buffer;
  unsigned long wc;
  int extra;   // continuation bytes expected after the lead byte

  while( len>0 )
    {
      byte_t lead = (byte_t)*source;

      if( (lead & 0x80)==0x00 )
        { // 00-7F     [0zzz-zzzz]
          wc = lead;
          extra = 0;
        }
      else if( (lead & 0xE0)==0xC0 )
        { // 080-7FF      [110y-yyyy] [10zz-zzzz]
          wc = lead & 0x1F;
          extra = 1;
        }
      else if( (lead & 0xF0)==0xE0 )
        { // 0800-FFFF    [1110-xxxx] [10yy-yyyy] [10zz-zzzz]
          wc = lead & 0x0F;
          extra = 2;
        }
      else
        { // 010000-10FFFF [1111-0www] [10xx-xxxx] [10yy-yyyy] [10zz-zzzz]
          // only the low three bits of the lead byte carry payload;
          // a wider mask would leak a marker bit into the code point
          wc = lead & 0x07;
          extra = 3;
        }
      source++;
      len--;

      // fold in the continuation bytes, stopping at the end of input
      while( extra>0 && len>0 )
        {
          wc = (wc<<6) + ((byte_t)*source & 0x3F);
          source++;
          len--;
          extra--;
        }

      if( wc > 0xFFFF )
        {
          // outside the BMP: split into high and low surrogates
          wc -= 0x10000;
          *buf++ = (wchar_t)(0xD800 + ((wc >> 10) & 0x3FF));
          *buf++ = (wchar_t)(0xDC00 + (wc & 0x3FF));
        }
      else
        {
          *buf++ = (unsigned short)wc;
        }
    }

  *buf = L'\0';

  // copy the stack scratch buffer into an exactly sized heap block
  len = CHAR_SIZE*(buf-buffer+1);
  buf = ALLOC( len );
  memcpy( buf, buffer, len );

  return buf;
}
{
  // Encodes the NUL-terminated wide string `source` as UTF-8 and
  // returns it in a newly ALLOC'ed, NUL-terminated byte buffer.
  // Each input unit is encoded on its own as 1, 2 or 3 bytes.
  int len = STRLEN( source );

  // Every unit may expand to three bytes, so the scratch buffer must
  // hold 3*len bytes plus the terminator; len+1 overflows the stack
  // as soon as the text contains non-ASCII characters.
  byte_t *buffer = alloca( 3*len+1 );
  byte_t *buf = buffer;
  
  while( len>0 )
    {
      char_t wc = *source;
      if( wc < 0x0080 )
        { // 0000-007F
          // from: [0xxxxxxx]
          //   to: [0xxxxxxx]
          *buf++ = (byte_t)wc; 
        }
      else if( wc < 0x0800 )
        { // 0080-07FF
          // from: [00000yyy yyxxxxxx]
          //   to: [110yyyyy] [10xxxxxx]
          *buf++ = 0xC0 | (byte_t)(wc >> 6);
          *buf++ = 0x80 | (byte_t)(wc & 0x3F); 
        }
      else
        { // 0800-FFFF
          // from: [zzzzyyyy yyxxxxxx]
          //   to: [1110zzzz] [10yyyyyy] [10xxxxxx]
          // the lead payload is masked to four bits so a value wider
          // than 16 bits cannot corrupt the 1110 marker
          *buf++ = 0xE0 | (byte_t)((wc >> 12) & 0x0F);
          *buf++ = 0x80 | (byte_t)((wc >> 6) & 0x3F);
          *buf++ = 0x80 | (byte_t)(wc & 0x3F);
        }
      len--;
      source++;
    }

  *buf = '\0';

  // copy the stack scratch buffer into an exactly sized heap block
  len = buf-buffer+1;
  buf = ALLOC( len );
  memcpy( buf, buffer, len );

  return buf;
}
char* UTF16_to_ASCII ( chars_t  ws)

Converts string of widechar UTF-16LE encoding to ASCII encoding. The input string is not freed.

Parameters:
ws: characters to convert
Returns:
converted string
Note:
function defined only if UNICODE_CHARS symbol is defined
{
  // Converts the wide string `ws` to a multibyte string with wcrtomb()
  // and returns it in a newly ALLOC'ed, NUL-terminated buffer.
  // Characters that wcrtomb() cannot convert are replaced by '?'.
  size_t len = wcslen( ws );
  char* buffer = alloca( 4*(len+1) ); // assume one utf16 can expand to 4 bytes max
  char* buf = buffer;
  mbstate_t state;
  size_t nbytes;

  memset (&state, '\0', sizeof (state));
  while (len>0)
    {
      nbytes = wcrtomb (buf, *ws, &state);
      if( nbytes==(size_t)-1 )
        {
          // Conversion failed: adding (size_t)-1 to buf would move the
          // write pointer backwards. Emit a placeholder instead and
          // reset the conversion state, which is undefined after an error.
          *buf = '?';
          nbytes = 1;
          memset (&state, '\0', sizeof (state));
        }
      buf += nbytes;
      len -= 1;
      ws += 1;
    }
  *buf = '\0';

  // copy the stack scratch buffer into an exactly sized heap block
  len = buf-buffer+1;
  buf = ALLOC( len );
  memcpy( buf, buffer, len );
  return buf;
}
chars_t ASCII_to_UTF16 ( const char *  s)

Converts string of ASCII encoding to widechar UTF-16LE encoding.

Parameters:
s: characters to convert
Returns:
converted string
Note:
defined only if UNICODE_CHARS symbol is defined
{
  // Converts the multibyte string `s` to a wide string with mbrtowc()
  // and returns it in a newly ALLOC'ed, NUL-terminated buffer.
  // Bytes that mbrtowc() cannot decode are copied through unchanged,
  // one wide character per byte.
  size_t len = strlen(s);

  // at most one wide character is produced per input byte
  wchar_t *buffer = ALLOC( CHAR_SIZE*(len+1) );
  wchar_t *buf = buffer;
  mbstate_t state;
  size_t nbytes;
  memset (&state, '\0', sizeof (state));
  while (len>0)
    {
      nbytes = mbrtowc (buf, s, len, &state);
      if( nbytes==(size_t)-1 || nbytes==(size_t)-2 || nbytes==0 )
        {
          // Invalid or incomplete sequence (or no progress): using the
          // return value as a byte count would grow len and move s
          // backwards. Pass the single byte through and restart from
          // a clean conversion state.
          *buf = (wchar_t)(unsigned char)*s;
          nbytes = 1;
          memset (&state, '\0', sizeof (state));
        }
      buf++;
      len -= nbytes;
      s += nbytes;
    }
  *buf = L'\0';

  // shrink to an exactly sized block and release the scratch buffer
  len = CHAR_SIZE*(buf-buffer+1);
  buf = ALLOC( len );
  memcpy( buf, buffer, len );
  DEALLOC( buffer );
  return buf;
}
chars_t ASCII_to_ASCII ( const char *  s)
Parameters:
s: characters to convert
Returns:
converted string
Note:
defined only if UNICODE_CHARS symbol is not defined

Converts string of ASCII encoding to ASCII. Actually does not convert anything. This function is used because it uses the ALLOC() macro which helps tracing memory allocation.

{
#ifdef UNICODE_CHARS
  // Unicode build: the converter is a stub that yields a null string.
  return (chars_t)0;
#else
  // ASCII build: duplicate the string, terminator included, in a
  // block obtained from ALLOC().
  size_t size = STRLEN(s);
  char* copy;

  size++;
  copy = ALLOC( size );
  memcpy( copy, s, size );
  return copy;
#endif //UNICODE_CHARS
}
void* load_file ( chars_t  wfilename,
int *  filesize 
)
Parameters:
wfilename: file name
filesize: file size
Returns:
word atom

Loads a text file, which can be in ASCII, multibyte UTF-8 or widechar UTF-16LE encoding. The size of the file is returned in filesize so that the caller can append a null character if needed.

{
  // Reads the whole file named by `wfilename` into a buffer obtained
  // from ALLOC() and stores the file size in *filesize. The buffer is
  // one byte larger than the file so the caller can append a
  // terminator. Error checks are compiled in only when SAFEMODE is
  // defined; in that case NULL is returned on failure and every
  // resource acquired so far is released.
  FILE* file;     // file stream
  void* buffer;      // file buffer
  struct stat st_info;  // file attributes

  // convert filename to multibyte
  char* filename = FILENAME(wfilename);

  errno = 0;
  
  // open file
  file = fopen( filename, "rb" );
  #ifdef SAFEMODE
    if( errno || !file )
      {
        // keep the reason for the failure visible to the caller
        int open_errno = errno;
        if( file ) fclose( file );
        #ifdef UNICODE_CHARS
          DEALLOC( filename );
        #endif
        errno = open_errno;
        return NULL;
      }
  #endif //SAFEMODE

  // file name is not needed any more
  #ifdef UNICODE_CHARS
    DEALLOC( filename );
  #endif

  // get file size
  if( fstat( fileno(file), &st_info ) )
    {
      #ifdef SAFEMODE
        // st_info is not valid, so the size cannot be trusted
        fclose( file );
        return NULL;
      #endif //SAFEMODE
    }
  *filesize = st_info.st_size;

  // allocate buffer
  buffer = ALLOC(*filesize+1);
  #ifdef SAFEMODE
  if( !buffer )
    {
      fclose( file );
      errno = ENOMEM;
      return NULL;
    }
  #endif //SAFEMODE

  // read file into the buffer
  if( *filesize && !fread(buffer,1,*filesize,file ) )
    {
      #ifdef SAFEMODE
        // release both the buffer and the still-open stream
        DEALLOC( buffer );
        fclose( file );
        return NULL;
      #endif //SAFEMODE
    }

  fclose( file );
  #ifdef SAFEMODE
  if( errno )
    {
      // the buffer came from ALLOC(), so it goes back through DEALLOC()
      DEALLOC( buffer );
      return NULL;
    }
  #endif //SAFEMODE

  return buffer;
}

[ HOME | INDEX | ATOMS | VARS | REFERENCE ]
Lhogho Developer's Documentation
Tue Feb 7 2012