Lhogho
0.0.027
|
Converters | |
Used to convert one string to another. In ASCII mode all conversions are like identities - i.e. no conversion is actually done. | |
#define | TEXT(a) L##a |
fix string constants | |
#define | FILENAME(x) UTF16_to_ASCII(x) |
fix file names | |
#define | UNFILENAME(x) ASCII_to_UTF16(x) |
unfix file names | |
Format strings | |
#define | NULL_CHAR TEXT('\0') |
#define | STR "%S" |
#define | CHR "%C" |
String functions | |
They are used to maintain dual UNICODE/ASCII processing. This is needed because there are different names for multibyte and widechar functions. | |
#define | NO_MORE WEOF |
#define | PUTCHAR(x, y) putwc(x,y) |
#define | GETCHAR(x) getwc(x) |
#define | STRLEN(x) wcslen(x) |
#define | STRNCPY(x, y, z) wcsncpy(x,y,z) |
#define | STRCMP(x, y) wcscmp(x,y) |
#define | STRTOD(x, y) wcstod(x,y) |
#define | STRTOL(x, y) wcstoll(x,y,0) |
#define | TOUPPER(x) towupper(x) |
#define | TOLOWER(x) towlower(x) |
#define | ISDIGIT(x) iswdigit(x) |
#define | STRCHR(x, y) wcschr(x,y) |
#define | PRINT(x,...) printf(x,__VA_ARGS__) |
#define | STRFTIME(x, y, z, t) wcsftime(x,y,z,t) |
#define | SPRINTF(x, n, y, z) swprintf(x,n,y,z) |
#define | SPRINT(x, n, y,...) swprintf(x,n,y,__VA_ARGS__) |
Functions | |
chars_t | UTF8_to_UTF16 (byte_t *source) |
converts UTF-8 to UTF-16 | |
byte_t * | UTF16_to_UTF8 (chars_t source) |
char * | UTF16_to_ASCII (chars_t ws) |
converts UTF-16 to ASCII | |
chars_t | ASCII_to_UTF16 (const char *s) |
converts ASCII to UTF-16 | |
chars_t | ASCII_to_ASCII (const char *s) |
converts ASCII to ASCII | |
void * | load_file (chars_t wfilename, int *filesize) |
load text file into a word atom |
/* Converters */
#define TEXT(a) L##a                            /* pastes the L prefix onto a literal */
#define FILENAME(x) UTF16_to_ASCII(x)
#define UNFILENAME(x) ASCII_to_UTF16(x)

/* Format strings */
#define STR "%S"
#define CHR "%C"

/* String functions: generic names mapped onto the widechar C library */
#define NO_MORE WEOF
#define PUTCHAR(x, y) putwc(x,y)
#define GETCHAR(x) getwc(x)
#define STRLEN(x) wcslen(x)
#define STRNCPY(x, y, z) wcsncpy(x,y,z)
#define STRCMP(x, y) wcscmp(x,y)
#define STRTOD(x, y) wcstod(x,y)
#define STRTOL(x, y) wcstoll(x,y,0)             /* base 0: the prefix selects the radix */
#define TOUPPER(x) towupper(x)
#define TOLOWER(x) towlower(x)
#define ISDIGIT(x) iswdigit(x)
#define STRCHR(x, y) wcschr(x,y)
#define PRINT(x,...) printf(x,__VA_ARGS__)      /* needs at least one argument after x */
#define STRFTIME(x, y, z, t) wcsftime(x,y,z,t)
#define SPRINTF(x, n, y, z) swprintf(x,n,y,z)
#define SPRINT(x, n, y,...) swprintf(x,n,y,__VA_ARGS__)
chars_t UTF8_to_UTF16 | ( | byte_t * | source | ) |
source | characters to convert |
Converts string of multibyte UTF-8 encoding to widechar UTF-16LE encoding.
{ int len = strlen ((char*)source); wchar_t *buffer = alloca( CHAR_SIZE*(len+1) ); wchar_t *buf = buffer; unsigned long wc; while( len>0 ) { if( (*source & 0x80)==0x00 ) { // 00-7F [0zzz-zzzz] wc = (byte_t)*source++; len -= 1; } else if( (*source & 0xE0)==0xC0 ) { // 080-7FF [110y-yyyy] [10zz-zzzz] wc = *source & 0x1F; source++; wc = (wc<<6) + (*source & 0x3F); source++; len -= 2; } else if( ((byte_t)*source & 0xF0)==0xE0 ) { // 0800-FFFF [1110-xxxx] [10yy-yyyy] [10zz-zzzz] wc = *source & 0x1F; source++; wc = (wc<<6) + (*source & 0x3F); source++; wc = (wc<<6) + (*source & 0x3F); source++; len -= 3; } else { // 01000-10FFFF [1111-wwww] [10xx-xxxx] [10yy-yyyy] [10zz-zzzz] wc = *source & 0x1F; source++; wc = (wc<<6) + (*source & 0x3F); source++; wc = (wc<<6) + (*source & 0x3F); source++; wc = (wc<<6) + (*source & 0x3F); source++; len -= 4; } *buf = (unsigned short)wc; buf++; } *buf = L'\0'; len = CHAR_SIZE*(buf-buffer+1); buf = ALLOC( len ); memcpy( buf, buffer, len ); return buf; }
byte_t* UTF16_to_UTF8 | ( | chars_t | source | ) |
{ int len = STRLEN( source ); byte_t *buffer = alloca( len+1 ); byte_t *buf = buffer; while( len>0 ) { char_t wc = *source; //printf("\nCODE=%4x|",wc); if( wc < 0x0080 ) { // 0000-007F // from: [0xxxxxxx] // to: [0xxxxxxx] *buf++ = (byte_t)wc; } else if( wc < 0x0800 ) { // 0080-07FF // from: [00000yyy yyxxxxxx] // to: [110yyyyy] [10xxxxxx] *buf++ = 0xC0 | (byte_t)(wc >> 6); *buf++ = 0x80 | (byte_t)(wc & 0x3F); } else { // 0800-FFFF // from: [zzzzyyyy yyxxxxxx] // to: [1110zzzz] [10yyyyyy] [10xxxxxx] *buf++ = 0xE0 | (byte_t)(wc >> 12); *buf++ = 0x80 | (byte_t)((wc >> 6) & 0x3F); *buf++ = 0x80 | (byte_t)(wc & 0x3F); } len--; source++; } *buf = '\0'; len = buf-buffer+1; buf = ALLOC( len ); memcpy( buf, buffer, len ); return buf; }
char* UTF16_to_ASCII | ( | chars_t | ws | ) |
Converts string of widechar UTF-16LE encoding to ASCII encoding. The input string is not freed.
ws | characters to convert |
{ size_t len = wcslen( ws ); char* buffer = alloca( 4*(len+1) ); // assume one utf16 can expand to 4 bytes max char* buf = buffer; mbstate_t state; size_t nbytes; memset (&state, '\0', sizeof (state)); while (len>0) { nbytes = wcrtomb (buf, *ws, &state); buf += nbytes; len -= 1; ws += 1; } *buf = '\0'; len = buf-buffer+1; buf = ALLOC( len ); memcpy( buf, buffer, len ); return buf; }
chars_t ASCII_to_UTF16 | ( | const char * | s | ) |
Converts string of ASCII encoding to widechar UTF-16LE encoding.
s | characters to convert |
{ //printf("===%s===\n",s); size_t len = strlen(s); wchar_t *buffer = ALLOC( CHAR_SIZE*(len+1) ); wchar_t *buf = buffer; mbstate_t state; size_t nbytes; memset (&state, '\0', sizeof (state)); while (len>0) { nbytes = mbrtowc (buf, s, len, &state); buf++; len -= nbytes; s += nbytes; } *buf = L'\0'; len = CHAR_SIZE*(buf-buffer+1); buf = ALLOC( len ); memcpy( buf, buffer, len ); DEALLOC( buffer ); return buf; }
chars_t ASCII_to_ASCII | ( | const char * | s | ) |
s | characters to convert |
Converts string of ASCII encoding to ASCII. Actually does not convert anything. This function is used because it uses the ALLOC() macro which helps tracing memory allocation.
wfilename | file name |
filesize | file size |
Loads a text file which can be ASCII, multibyte UTF-8 or widechar UTF-16LE encoding. The size of the file is returned in filesize
so that the caller can append null character if needed.
{ FILE* file; // file stream void* buffer; // file buffer struct stat st_info; // file attributes // convert filename to multibyte char* filename = FILENAME(wfilename); errno = 0; // open file file = fopen( filename, "rb" ); #ifdef SAFEMODE if( errno ) return NULL; #endif //SAFEMODE // get file size fstat( fileno(file), &st_info ); *filesize = st_info.st_size; // file name is not needed any more #ifdef UNICODE_CHARS DEALLOC( filename ); #endif // allocate buffer buffer = ALLOC(*filesize+1); #ifdef SAFEMODE if( !buffer ) { fclose( file ); errno = ENOMEM; return NULL; } #endif //SAFEMODE // read file into the buffer if( *filesize && !fread(buffer,1,*filesize,file ) ) { #ifdef SAFEMODE DEALLOC( buffer ); return NULL; #endif //SAFEMODE } fclose( file ); #ifdef SAFEMODE if( errno ) { free( buffer ); return NULL; } #endif //SAFEMODE return buffer; }