Converters
Used to convert one string to another. In ASCII mode all conversions are like identities - i.e. no conversion is actually done.
#define	TEXT(a) L##a
	fix string constants
#define	FILENAME(x) UTF16_to_ASCII(x)
	fix file names
#define	UNFILENAME(x) ASCII_to_UTF16(x)
	unfix file names
Format strings
#define	NULL_CHAR TEXT('\0')
#define	STR "%S"
#define	CHR "%C"
String functions
They are used to maintain dual UNICODE/ASCII processing. This is needed because the multibyte and widechar variants of these functions have different names.
#define	NO_MORE WEOF
#define	PUTCHAR(x, y) putwc(x,y)
#define	GETCHAR(x) getwc(x)
#define	STRLEN(x) wcslen(x)
#define	STRNCPY(x, y, z) wcsncpy(x,y,z)
#define	STRCMP(x, y) wcscmp(x,y)
#define	STRTOD(x, y) wcstod(x,y)
#define	STRTOL(x, y) wcstoll(x,y,0)
#define	TOUPPER(x) towupper(x)
#define	TOLOWER(x) towlower(x)
#define	ISDIGIT(x) iswdigit(x)
#define	STRCHR(x, y) wcschr(x,y)
#define	PRINT(x,...) printf(x,__VA_ARGS__)
#define	STRFTIME(x, y, z, t) wcsftime(x,y,z,t)
#define	SPRINTF(x, n, y, z) swprintf(x,n,y,z)
#define	SPRINT(x, n, y,...) swprintf(x,n,y,__VA_ARGS__)
Functions
chars_t	UTF8_to_UTF16 (byte_t *source)
	converts UTF-8 to UTF-16
byte_t *	UTF16_to_UTF8 (chars_t source)
char *	UTF16_to_ASCII (chars_t ws)
	converts UTF-16 to ASCII
chars_t	ASCII_to_UTF16 (const char *s)
	converts ASCII to UTF-16
chars_t	ASCII_to_ASCII (const char *s)
	converts ASCII to ASCII
void *	load_file (chars_t wfilename, int *filesize)
	load text file into a word atom

Define Documentation

#define TEXT ( a ) L##a

#define FILENAME ( x ) UTF16_to_ASCII(x)

#define UNFILENAME ( x ) ASCII_to_UTF16(x)

#define NULL_CHAR TEXT('\0')

#define STR "%S"

#define CHR "%C"

#define NO_MORE WEOF

#define PUTCHAR	(	x,
		y
	)	putwc(x,y)

#define GETCHAR ( x ) getwc(x)

#define STRLEN ( x ) wcslen(x)

#define STRNCPY	(	x,
		y,
		z
	)	wcsncpy(x,y,z)

#define STRCMP	(	x,
		y
	)	wcscmp(x,y)

#define STRTOD	(	x,
		y
	)	wcstod(x,y)

#define STRTOL	(	x,
		y
	)	wcstoll(x,y,0)

#define TOUPPER ( x ) towupper(x)

#define TOLOWER ( x ) towlower(x)

#define ISDIGIT ( x ) iswdigit(x)

#define STRCHR	(	x,
		y
	)	wcschr(x,y)

#define PRINT	(	x,
		...
	)	printf(x,__VA_ARGS__)

#define STRFTIME	(	x,
		y,
		z,
		t
	)	wcsftime(x,y,z,t)

#define SPRINTF	(	x,
		n,
		y,
		z
	)	swprintf(x,n,y,z)

#define SPRINT	(	x,
		n,
		y,
		...
	)	swprintf(x,n,y,__VA_ARGS__)

Function Documentation

chars_t UTF8_to_UTF16 ( byte_t * source )

Parameters:

source characters to convert

Returns:: converted string

Note:: defined only if UNICODE_CHARS symbol is defined

Converts string of multibyte UTF-8 encoding to widechar UTF-16LE encoding.

{
  // Decodes the NUL-terminated UTF-8 string in source and returns the
  // UTF-16 result in a buffer obtained from ALLOC().
  //  - code points above U+FFFF are written as a surrogate pair
  //  - an invalid lead byte, a stray continuation byte or a sequence cut
  //    short by the terminator is replaced with U+FFFD
  size_t len = strlen ((char*)source);

  // Every output unit consumes at least one input byte, and the only case
  // producing two units (a 4-byte sequence) consumes four bytes, so len+1
  // units are always enough for the text plus the terminator.
  wchar_t *buffer = alloca( CHAR_SIZE*(len+1) );
  wchar_t *buf = buffer;
  unsigned long wc;
  size_t need;  // number of bytes in the current sequence
  size_t i;

  while( len>0 )
    {
      unsigned int lead = *source & 0xFF;

      if( (lead & 0x80)==0x00 )
        { // 00-7F     [0zzz-zzzz]
          need = 1;
          wc = lead;
        }
      else if( (lead & 0xE0)==0xC0 )
        { // 080-7FF      [110y-yyyy] [10zz-zzzz]
          need = 2;
          wc = lead & 0x1F;
        }
      else if( (lead & 0xF0)==0xE0 )
        { // 0800-FFFF    [1110-xxxx] [10yy-yyyy] [10zz-zzzz]
          need = 3;
          wc = lead & 0x0F;
        }
      else if( (lead & 0xF8)==0xF0 )
        { // 010000-10FFFF [1111-0www] [10xx-xxxx] [10yy-yyyy] [10zz-zzzz]
          // only the low 3 bits of the lead byte carry data
          need = 4;
          wc = lead & 0x07;
        }
      else
        { // stray continuation byte (80-BF) or invalid lead byte (F8-FF)
          need = 1;
          wc = 0xFFFD;
        }

      if( need>len )
        {
          // the sequence is cut short by the terminator: do not read past
          // the end of the string, consume what is left instead
          need = len;
          wc = 0xFFFD;
        }
      else
        {
          for( i=1; i<need; i++ )
            wc = (wc<<6) + (source[i] & 0x3F);
        }

      source += need;
      len -= need;

      // lead bytes F5-F7 can encode values beyond the Unicode range
      if( wc>0x10FFFF ) wc = 0xFFFD;

      if( wc>0xFFFF )
        {
          // does not fit into one 16-bit unit: emit high+low surrogates
          wc -= 0x10000;
          *buf++ = (wchar_t)(0xD800 + (wc>>10));
          *buf++ = (wchar_t)(0xDC00 + (wc & 0x3FF));
        }
      else
        {
          *buf++ = (wchar_t)wc;
        }
    }

  *buf = L'\0';

  // copy the result from the stack into a tightly sized ALLOC'ed buffer
  len = CHAR_SIZE*(size_t)(buf-buffer+1);
  buf = ALLOC( len );
  memcpy( buf, buffer, len );

  return buf;
}

byte_t* UTF16_to_UTF8 ( chars_t source )

{
  // Encodes the NUL-terminated wide string in source as UTF-8 and returns
  // the result in a buffer obtained from ALLOC().
  //  - a high+low surrogate pair is combined into one 4-byte sequence
  //  - an unpaired surrogate is encoded as a 3-byte sequence, as before
  //  - a value beyond U+10FFFF is replaced with U+FFFD
  size_t len = STRLEN( source );

  // One input unit expands to at most 4 bytes (3 for a value up to FFFF,
  // 4 for a wider code point; a surrogate pair is 2 units -> 4 bytes).
  // The former len+1 bytes overflowed on any non-ASCII character.
  byte_t *buffer = alloca( 4*len+1 );
  byte_t *buf = buffer;

  while( len>0 )
    {
      unsigned long wc = (unsigned long)*source;

      if( wc>=0xD800 && wc<=0xDBFF && len>1 )
        {
          // high surrogate: merge with the following low surrogate, if any
          unsigned long low = (unsigned long)source[1];
          if( low>=0xDC00 && low<=0xDFFF )
            {
              wc = 0x10000 + ((wc-0xD800)<<10) + (low-0xDC00);
              source++;
              len--;
            }
        }

      // also catches negative values when the wide type is signed
      if( wc>0x10FFFF ) wc = 0xFFFD;

      if( wc < 0x0080 )
        { // 0000-007F
          // from: [0xxxxxxx]
          //   to: [0xxxxxxx]
          *buf++ = (byte_t)wc;
        }
      else if( wc < 0x0800 )
        { // 0080-07FF
          // from: [00000yyy yyxxxxxx]
          //   to: [110yyyyy] [10xxxxxx]
          *buf++ = 0xC0 | (byte_t)(wc >> 6);
          *buf++ = 0x80 | (byte_t)(wc & 0x3F);
        }
      else if( wc < 0x10000 )
        { // 0800-FFFF
          // from: [zzzzyyyy yyxxxxxx]
          //   to: [1110zzzz] [10yyyyyy] [10xxxxxx]
          *buf++ = 0xE0 | (byte_t)(wc >> 12);
          *buf++ = 0x80 | (byte_t)((wc >> 6) & 0x3F);
          *buf++ = 0x80 | (byte_t)(wc & 0x3F);
        }
      else
        { // 010000-10FFFF
          // from: [000wwwzz zzzzyyyy yyxxxxxx]
          //   to: [11110www] [10zzzzzz] [10yyyyyy] [10xxxxxx]
          *buf++ = 0xF0 | (byte_t)(wc >> 18);
          *buf++ = 0x80 | (byte_t)((wc >> 12) & 0x3F);
          *buf++ = 0x80 | (byte_t)((wc >> 6) & 0x3F);
          *buf++ = 0x80 | (byte_t)(wc & 0x3F);
        }
      len--;
      source++;
    }

  *buf = '\0';

  // copy the result from the stack into a tightly sized ALLOC'ed buffer
  len = (size_t)(buf-buffer+1);
  buf = ALLOC( len );
  memcpy( buf, buffer, len );

  return buf;
}

char* UTF16_to_ASCII ( chars_t ws )

Converts string of widechar UTF-16LE encoding to ASCII encoding. The input string is not freed.

Parameters:

ws	characters to convert

Returns:: converted string

Note:: function defined only if UNICODE_CHARS symbol is defined

{
  // Converts the wide string ws character by character with wcrtomb() and
  // returns the multibyte result in a buffer obtained from ALLOC().
  // A character that wcrtomb() cannot convert is replaced with '?'.
  size_t len = wcslen( ws );
  char* buffer = alloca( 4*(len+1) ); // assume one utf16 can expand to 4 bytes max
  char* buf = buffer;
  mbstate_t state;
  size_t nbytes;

  memset (&state, '\0', sizeof (state));
  while (len>0)
    {
      nbytes = wcrtomb (buf, *ws, &state);
      if (nbytes==(size_t)-1)
        {
          // Conversion failed. Adding (size_t)-1 to buf would move the
          // write position backwards, so store a placeholder instead.
          // The conversion state is unspecified after an error: reset it.
          memset (&state, '\0', sizeof (state));
          *buf = '?';
          nbytes = 1;
        }
      buf += nbytes;
      len -= 1;
      ws += 1;
    }
  *buf = '\0';

  // copy the result from the stack into a tightly sized ALLOC'ed buffer
  len = (size_t)(buf-buffer+1);
  buf = ALLOC( len );
  memcpy( buf, buffer, len );
  return buf;
}

chars_t ASCII_to_UTF16 ( const char * s )

Converts string of ASCII encoding to widechar UTF-16LE encoding.

Parameters:

s	characters to convert

Returns:: converted string

Note:: defined only if UNICODE_CHARS symbol is defined

{
  // Converts the multibyte string s character by character with mbrtowc()
  // and returns the wide result in a buffer obtained from ALLOC().
  // An invalid or truncated multibyte sequence is replaced with L'?'.
  size_t len = strlen(s);

  // each wide character consumes at least one input byte, so len+1 wide
  // characters are enough for the text plus the terminator
  wchar_t *buffer = ALLOC( CHAR_SIZE*(len+1) );
  wchar_t *buf = buffer;
  mbstate_t state;
  size_t nbytes;
  memset (&state, '\0', sizeof (state));
  while (len>0)
    {
      nbytes = mbrtowc (buf, s, len, &state);
      if (nbytes==(size_t)-1)
        {
          // invalid sequence: skip one byte; the conversion state is
          // unspecified after an error, so start from a clean one
          memset (&state, '\0', sizeof (state));
          *buf = L'?';
          nbytes = 1;
        }
      else if (nbytes==(size_t)-2)
        {
          // the remaining bytes are an incomplete character: consume them
          memset (&state, '\0', sizeof (state));
          *buf = L'?';
          nbytes = len;
        }
      else if (nbytes==0)
        {
          // a NUL was decoded; always advance so the loop terminates
          nbytes = 1;
        }
      buf++;
      len -= nbytes;
      s += nbytes;
    }
  *buf = L'\0';

  // hand back a tightly sized copy and release the working buffer
  len = CHAR_SIZE*(size_t)(buf-buffer+1);
  buf = ALLOC( len );
  memcpy( buf, buffer, len );
  DEALLOC( buffer );
  return buf;
}

chars_t ASCII_to_ASCII ( const char * s )

Parameters:

s	characters to convert

Returns:: converted string

Note:: defined only if UNICODE_CHARS symbol is not defined

Converts string of ASCII encoding to ASCII. Actually does not convert anything. This function is used because it uses the ALLOC() macro which helps tracing memory allocation.

{
#ifdef UNICODE_CHARS
  // in a Unicode build this function only returns a null pointer
  return (chars_t)0;
#else
  // duplicate the string, terminator included, into ALLOC'ed memory
  size_t size = STRLEN(s)+1;
  char* copy = ALLOC( size );
  memcpy( copy, s, size );
  return copy;
#endif //UNICODE_CHARS
}

void* load_file	(	chars_t	wfilename,
		int *	filesize
	)

Parameters:

wfilename	file name
filesize	file size

Returns:: word atom

Loads a text file which can be ASCII, multibyte UTF-8 or widechar UTF-16LE encoding. The size of the file is returned in filesize so that the caller can append null character if needed.

{
  // Reads the whole file named wfilename into a buffer obtained from
  // ALLOC() (file size + 1 bytes) and stores the size reported by fstat()
  // in *filesize. Returns NULL if the file cannot be opened or examined.
  // With SAFEMODE defined, allocation, read and close failures also
  // return NULL, after releasing the buffer and closing the file.
  FILE* file;           // file stream
  void* buffer;         // file buffer
  struct stat st_info;  // file attributes
  size_t nread;         // number of bytes actually read
  int saved_errno;      // errno of the failing call, kept across cleanup

  // convert filename to multibyte
  char* filename = FILENAME(wfilename);

  errno = 0;

  // open file
  file = fopen( filename, "rb" );
  saved_errno = errno;

  // file name is not needed any more; release it before any early return
  #ifdef UNICODE_CHARS
    DEALLOC( filename );
  #endif

  // Test the returned stream, not errno: errno is only meaningful after a
  // failed call. The test is unconditional because fileno(NULL) below
  // would be invalid in any build.
  if( !file )
    {
      errno = saved_errno;
      return NULL;
    }

  // get file size
  if( fstat( fileno(file), &st_info )!=0 )
    {
      // st_info is not valid, so there is no size to work with
      saved_errno = errno;
      fclose( file );
      errno = saved_errno;
      return NULL;
    }
  *filesize = st_info.st_size;

  // allocate buffer
  buffer = ALLOC(*filesize+1);
  #ifdef SAFEMODE
  if( !buffer )
    {
      fclose( file );
      errno = ENOMEM;
      return NULL;
    }
  #endif //SAFEMODE

  // read file into the buffer (nothing to read from an empty file)
  nread = *filesize ? fread(buffer,1,*filesize,file ) : 0;
  if( nread!=(size_t)*filesize )
    {
      #ifdef SAFEMODE
        // short read: part of the buffer would be left uninitialised
        saved_errno = errno;
        DEALLOC( buffer );
        fclose( file );
        errno = saved_errno;
        return NULL;
      #endif //SAFEMODE
    }

  if( fclose( file )!=0 )
    {
      #ifdef SAFEMODE
        DEALLOC( buffer );
        return NULL;
      #endif //SAFEMODE
    }

  #ifdef SAFEMODE
    // every failure has returned above; a successful library call may still
    // leave a stale value in errno, so clear it for callers that test it
    errno = 0;
  #endif //SAFEMODE

  return buffer;
}

Converters

Format strings

String functions

Functions

Define Documentation

Function Documentation