/* * aegis - project change supervisor * Copyright (C) 1995, 1996, 1998, 1999 Peter Miller; * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA. * * * MANIFEST: wide string manipulation functions * * A literal pool is maintained. Each string has a reference count. The * string stays in the literal pool for as long as it has a positive * reference count. To determine if a string is already in the literal pool, * linear dynamic hashing is used to guarantee an O(1) search. Making all equal * strings the same item in the literal pool means that string equality is * a pointer test, and thus very fast. */ #include #include #include #include #include #include #include #include #include #include #include #include static wstring_ty **hash_table; static wstr_hash_ty hash_modulus; static wstr_hash_ty hash_cutover; static wstr_hash_ty hash_cutover_mask; static wstr_hash_ty hash_cutover_split_mask; static wstr_hash_ty hash_split; static wstr_hash_ty hash_load; static int changed; #define MAX_HASH_LEN 20 /* * NAME * hash_generate - hash string to number * * SYNOPSIS * wstr_hash_ty hash_generate(wchar_t *s, size_t n); * * DESCRIPTION * The hash_generate function is used to make a number from a string. * * RETURNS * wstr_hash_ty - the magic number * * CAVEAT * Only the last MAX_HASH_LEN characters are used. * It is important that wstr_hash_ty be unsigned (int or long). */ static wstr_hash_ty hash_generate _((const wchar_t *, size_t)); static wstr_hash_ty hash_generate(s, n) const wchar_t *s; size_t n; { wstr_hash_ty retval; if (n > MAX_HASH_LEN) { s += n - MAX_HASH_LEN; n = MAX_HASH_LEN; } retval = 0; while (n > 0) { retval = (retval + (retval << 1)) ^ *s++; --n; } return retval; } /* * NAME * wstr_initialize - start up string table * * SYNOPSIS * void wstr_initialize(void); * * DESCRIPTION * The wstr_initialize function is used to create the hash table and * initialize it to empty. * * RETURNS * void * * CAVEAT * This function must be called before any other defined in this file. */ static void wstr_initialize _((void)); static void wstr_initialize() { wstr_hash_ty j; if (hash_modulus) return; hash_modulus = 1<<8; /* MUST be a power of 2 */ hash_cutover = hash_modulus; hash_split = hash_modulus - hash_cutover; hash_cutover_mask = hash_cutover - 1; hash_cutover_split_mask = (hash_cutover * 2) - 1; hash_load = 0; hash_table = (wstring_ty **)mem_alloc(hash_modulus * sizeof(wstring_ty *)); for (j = 0; j < hash_modulus; ++j) hash_table[j] = 0; } /* * NAME * split - reduce table loading * * SYNOPSIS * void split(void); * * DESCRIPTION * The split function is used to reduce the load factor on the hash table. * * RETURNS * void * * CAVEAT * A load factor of about 80% is suggested. */ static void split _((void)); static void split() { wstring_ty *p; wstring_ty *p2; wstr_hash_ty idx; /* * get the list to be split across buckets */ p = hash_table[hash_split]; hash_table[hash_split] = 0; /* * increase the modulus by one */ hash_modulus++; hash_table = mem_change_size(hash_table, hash_modulus * sizeof(wstring_ty *)); hash_table[hash_modulus - 1] = 0; hash_split = hash_modulus - hash_cutover; if (hash_split >= hash_cutover) { hash_cutover = hash_modulus; hash_split = 0; hash_cutover_mask = hash_cutover - 1; hash_cutover_split_mask = (hash_cutover * 2) - 1; } /* * now redistribute the list elements */ while (p) { p2 = p; p = p->wstr_next; idx = p2->wstr_hash & hash_cutover_mask; if (idx < hash_split) idx = p2->wstr_hash & hash_cutover_split_mask; p2->wstr_next = hash_table[idx]; hash_table[idx] = p2; } } /* * NAME * wstr_from_c - make string from C string * * SYNOPSIS * wstring_ty *wstr_from_c(char *); * * DESCRIPTION * The wstr_from_c function is used to make a string from a NUL * terminated C string. The conversion from multi-byte to wide * characters is done in the current locale. * * RETURNS * wstring_ty* - a pointer to a string in dynamic memory. Use * wstr_free when finished with. * * CAVEAT * The contents of the structure pointed to MUST NOT be altered. */ wstring_ty * wstr_from_c(s) const char *s; { return wstr_n_from_c(s, strlen(s)); } /* * NAME * wstr_from_wc - make string from a wide C string * * SYNOPSIS * wstring_ty *wstr_from_wc(wchar_t *); * * DESCRIPTION * The wstr_from_c function is used to make a string from a NUL * terminated wide C string. * * RETURNS * wstring_ty* - a pointer to a string in dynamic memory. Use * wstr_free when finished with. * * CAVEAT * The contents of the structure pointed to MUST NOT be altered. */ wstring_ty * wstr_from_wc(ws) const wchar_t *ws; { return wstr_n_from_wc(ws, wcslen(ws)); } /* * NAME * wstr_n_from_c - make string * * SYNOPSIS * wstring_ty *wstr_n_from_c(char *s, size_t n); * * DESCRIPTION * The wstr_n_from_c function is used to make a string from an * array of characters. No NUL terminator is assumed. The * conversion from muti-byte to wide characters is done in the * current locale. * * RETURNS * wstring_ty* - a pointer to a string in dynamic memory. Use * wstr_free when finished with. * * CAVEAT * The contents of the structure pointed to MUST NOT be altered. */ wstring_ty * wstr_n_from_c(s, length) const char *s; size_t length; { #if __STDC__ >= 1 static char escapes[] = "\aa\bb\ff\nn\rr\tt\vv"; #else static char escapes[] = "\bb\ff\nn\rr\tt"; #endif static wchar_t *buf; static size_t bufmax; size_t remainder; const char *ip; wchar_t *op; /* * Do the conversion "long hand". This is because some * implementations of the mbstowcs function barf when they see * invalid multi byte character sequences. This function * renders them as C escape sequences and keeps going. */ if (bufmax < length) { bufmax = length; /* the 4 is the longest escape sequence */ buf = mem_change_size(buf, bufmax * sizeof(wchar_t) * 4); } /* * change the locale to the native language default */ language_human(); /* * Reset the mbtowc internal state. */ mbtowc((wchar_t *)0, (char *)0, 0); /*lint !e418 */ /* * scan the string and extract the wide characters */ ip = s; op = buf; remainder = length; while (remainder > 0) { int n; n = mbtowc(op, ip, remainder); if (n == 0) break; if (n < 0) { char *esc; /* * Invalid multi byte sequence, replace the * first character with a C escape sequence. */ esc = strchr(escapes, *ip); if (esc) { *op++ = '\\'; *op++ = esc[1]; } else { *op++ = '\\'; *op++ = '0' + ((*ip >> 6) & 7); *op++ = '0' + ((*ip >> 3) & 7); *op++ = '0' + ( *ip & 7); } ++ip; --remainder; /* * The mbtowc function's internal state will now * be "error" or broken, or otherwise useless. * Reset it so that we can keep going. */ mbtowc((wchar_t *)0, (char *)0, 0); /*lint !e418 */ } else { /* * the one wchar_t used n chars */ ip += n; remainder -= n; ++op; } } /* * change the locale back to the C locale */ language_C(); /* * build the result from the image in ``buf'' */ return wstr_n_from_wc(buf, op - buf); } /* * NAME * wstr_to_mbs - wide string to multi-byte C string * * SYNOPSIS * void wstr_to_mbs(wstring_ty *s, char **rslt, size_t *rslt_len); * * DESCRIPTION * The wstr_to_mbs function convers a wide character string into a * multi-byte C string. The conversion is done in the current * locale. The result is NUL terminated, however the result length * does not include the NUL. * * CAVEAT * DO NOT free the result. The result will change between calls, * so copy it if you need to keep it. */ void wstr_to_mbs(s, result_p, result_length_p) const wstring_ty *s; char **result_p; size_t *result_length_p; { static char *buf; static size_t bufmax; int n; const wchar_t *ip; size_t remainder; char *op; size_t buflen; /* * For reasons I don't understand, the MB_CUR_MAX symbol (wich is * defined as a reference to __mb_cur_max) does not get resolved * at link time, despite being present in libc. It's easier to * just dodge the question. */ #ifdef __CYGWIN__ #undef MB_CUR_MAX #define MB_CUR_MAX 8 #endif /* * Do the conversion "long hand". This is because the wcstombs * function barfs when it sees an invalid wchar_t. This * function treats them literally and keeps going. */ buflen = (s->wstr_length + 1) * MB_CUR_MAX; if (buflen < s->wstr_length + 1) { /* * There are some wonderfully brain-dead implementatiosn * out there. The more stupid ones manage to make * MB_CUR_MAX be zero! */ buflen = s->wstr_length + 1; } if (buflen > bufmax) { bufmax = buflen; buf = mem_change_size(buf, bufmax); } /* * perform the conversion in the native language default */ language_human(); /* * The wctomb function has internal state. It needs to be reset. */ wctomb((char *)0, 0); /*lint !e418 */ ip = s->wstr_text; remainder = s->wstr_length; op = buf; while (remainder > 0) { n = wctomb(op, *ip); if (n <= 0) { /* * Copy the character literally. * Throw away anything that will not fit. */ *op++ = *ip++; if (!op[-1]) op[-1] = '?'; --remainder; /* * The wctomb function's internal state will now * be "error" or broken, or otherwise useless. * Reset it so that we can keep going. */ wctomb((char *)0, 0); /*lint !e418 */ } else { op += n; ++ip; --remainder; } } /* * The final NUL could require shift state end characters, * meaning that n could be more than 1. */ n = wctomb(op, (wchar_t)0); if (n <= 0) *op = 0; else { op += n - 1; assert(*op == 0); } /* * restore the locale to the C locale */ language_C(); /* * set the output side effects */ *result_p = buf; *result_length_p = op - buf; } /* * NAME * wstr_n_from_wc - make string * * SYNOPSIS * wstring_ty *wstr_n_from_wc(wchar_t *s, size_t n); * * DESCRIPTION * The wstr_n_from_c function is used to make a string from an * array of wide characters. No NUL terminator is assumed. * * RETURNS * wstring_ty* - a pointer to a string in dynamic memory. Use * wstr_free when finished with. * * CAVEAT * The contents of the structure pointed to MUST NOT be altered. */ wstring_ty * wstr_n_from_wc(s, length) const wchar_t *s; size_t length; { wstr_hash_ty hash; wstr_hash_ty idx; wstring_ty *p; if (!hash_modulus) wstr_initialize(); hash = hash_generate(s, length); idx = hash & hash_cutover_mask; if (idx < hash_split) idx = hash & hash_cutover_split_mask; for (p = hash_table[idx]; p; p = p->wstr_next) { if ( p->wstr_hash == hash && p->wstr_length == length && !memcmp(p->wstr_text, s, length * sizeof(wchar_t)) ) { p->wstr_references++; return p; } } p = (wstring_ty *)mem_alloc(sizeof(wstring_ty) + length * sizeof(wchar_t)); p->wstr_hash = hash; p->wstr_length = length; p->wstr_references = 1; p->wstr_next = hash_table[idx]; hash_table[idx] = p; memcpy(p->wstr_text, s, length * sizeof(wchar_t)); p->wstr_text[length] = 0; hash_load++; while (hash_load * 10 > hash_modulus * 8) split(); ++changed; return p; } /* * NAME * wstr_copy - make a copy of a string * * SYNOPSIS * wstring_ty *wstr_copy(wstring_ty *s); * * DESCRIPTION * The wstr_copy function is used to make a copy of a string. * * RETURNS * wstring_ty* - a pointer to a string in dynamic memory. Use wstr_free when * finished with. * * CAVEAT * The contents of the structure pointed to MUST NOT be altered. */ wstring_ty * wstr_copy(s) wstring_ty *s; { s->wstr_references++; return s; } /* * NAME * wstr_free - release a string * * SYNOPSIS * void wstr_free(wstring_ty *s); * * DESCRIPTION * The wstr_free function is used to indicate that a string hash been * finished with. * * RETURNS * void * * CAVEAT * This is the only way to release strings DO NOT use the free function. */ void wstr_free(s) wstring_ty *s; { wstr_hash_ty idx; wstring_ty **spp; if (!s) return; if (s->wstr_references > 1) { s->wstr_references--; return; } ++changed; /* * find the hash bucket it was in, * and remove it */ idx = s->wstr_hash & hash_cutover_mask; if (idx < hash_split) idx = s->wstr_hash & hash_cutover_split_mask; for (spp = &hash_table[idx]; *spp; spp = &(*spp)->wstr_next) { if (*spp == s) { *spp = s->wstr_next; free(s); --hash_load; return; } } /* * should never reach here! */ fatal_raw("attempted to free non-existent wstring (bug)"); } /* * NAME * wstr_catenate - join two strings * * SYNOPSIS * wstring_ty *wstr_catenate(wstring_ty *, wstring_ty *); * * DESCRIPTION * The wstr_catenate function is used to concatenate two strings to form a * new string. * * RETURNS * wstring_ty* - a pointer to a string in dynamic memory. Use wstr_free when * finished with. * * CAVEAT * The contents of the structure pointed to MUST NOT be altered. */ wstring_ty * wstr_catenate(s1, s2) const wstring_ty *s1; const wstring_ty *s2; { static wchar_t *tmp; static size_t tmplen; wstring_ty *s; size_t length; length = s1->wstr_length + s2->wstr_length; if (length > tmplen) { tmplen = length; tmp = mem_change_size(tmp, tmplen * sizeof(wchar_t)); } memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t)); memcpy(tmp + s1->wstr_length, s2->wstr_text, s2->wstr_length * sizeof(wchar_t)); s = wstr_n_from_wc(tmp, length); return s; } /* * NAME * wstr_cat_three - join three strings * * SYNOPSIS * wstring_ty *wstr_cat_three(wstring_ty *, wstring_ty *, wstring_ty *); * * DESCRIPTION * The wstr_cat_three function is used to concatenate three strings to form * a new string. * * RETURNS * wstring_ty* - a pointer to a string in dynamic memory. Use wstr_free when * finished with. * * CAVEAT * The contents of the structure pointed to MUST NOT be altered. */ wstring_ty * wstr_cat_three(s1, s2, s3) const wstring_ty *s1; const wstring_ty *s2; const wstring_ty *s3; { static wchar_t *tmp; static size_t tmplen; wstring_ty *s; size_t length; length = s1->wstr_length + s2->wstr_length + s3->wstr_length; if (tmplen < length) { tmplen = length; tmp = mem_change_size(tmp, tmplen * sizeof(wchar_t)); } memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t)); memcpy ( tmp + s1->wstr_length, s2->wstr_text, s2->wstr_length * sizeof(wchar_t) ); memcpy ( tmp + s1->wstr_length + s2->wstr_length, s3->wstr_text, s3->wstr_length * sizeof(wchar_t) ); s = wstr_n_from_wc(tmp, length); return s; } wstring_ty * wstr_capitalize(ws) const wstring_ty *ws; { static wchar_t *buffer; static size_t buflen; size_t j; int prev_was_alpha; if (ws->wstr_length > buflen) { buflen = ws->wstr_length; buffer = mem_change_size(buffer, buflen * sizeof(wchar_t)); } language_human(); prev_was_alpha = 0; for (j = 0; j < ws->wstr_length; ++j) { wchar_t c; c = ws->wstr_text[j]; if (iswlower(c)) { if (!prev_was_alpha) c = towupper(c); prev_was_alpha = 1; } else if (iswupper(c)) { if (prev_was_alpha) c = towlower(c); prev_was_alpha = 1; } else prev_was_alpha = 0; buffer[j] = c; } language_C(); return wstr_n_from_wc(buffer, ws->wstr_length); } wstring_ty * wstr_to_upper(ws) const wstring_ty *ws; { static wchar_t *buffer; static size_t buflen; size_t j; if (ws->wstr_length > buflen) { buflen = ws->wstr_length; buffer = mem_change_size(buffer, buflen * sizeof(wchar_t)); } language_human(); for (j = 0; j < ws->wstr_length; ++j) { wchar_t c; c = ws->wstr_text[j]; if (iswlower(c)) c = towupper(c); buffer[j] = c; } language_C(); return wstr_n_from_wc(buffer, ws->wstr_length); } wstring_ty * wstr_to_lower(ws) const wstring_ty *ws; { static wchar_t *buffer; static size_t buflen; size_t j; if (ws->wstr_length > buflen) { buflen = ws->wstr_length; buffer = mem_change_size(buffer, buflen * sizeof(wchar_t)); } language_human(); for (j = 0; j < ws->wstr_length; ++j) { wchar_t c; c = ws->wstr_text[j]; if (iswupper(c)) c = towlower(c); buffer[j] = c; } language_C(); return wstr_n_from_wc(buffer, ws->wstr_length); } wstring_ty * wstr_to_ident(ws) const wstring_ty *ws; { static wchar_t *buffer; static size_t buflen; size_t j; if (ws->wstr_length == 0) return wstr_from_c("_"); if (ws->wstr_length > buflen) { buflen = ws->wstr_length; buffer = mem_change_size(buffer, buflen * sizeof(wchar_t)); } language_human(); for (j = 0; j < ws->wstr_length; ++j) { wchar_t c; c = ws->wstr_text[j]; if (!iswalnum(c)) c = '_'; buffer[j] = c; } if (iswdigit(buffer[0])) buffer[0] = '_'; language_C(); return wstr_n_from_wc(buffer, ws->wstr_length); } /* * NAME * wstr_equal - test equality of strings * * SYNOPSIS * int wstr_equal(wstring_ty *, wstring_ty *); * * DESCRIPTION * The wstr_equal function is used to test if two strings are equal. * * RETURNS * int; zero if the strings are not equal, nonzero if the strings are * equal. * * CAVEAT * This function is implemented as a macro in strings.h */ #ifndef wstr_equal int wstr_equal(s1, s2) const wstring_ty *s1; const wstring_ty *s2; { return (s1 == s2); } #endif wstring_ty * str_to_wstr(s) const string_ty *s; { return wstr_n_from_c(s->str_text, s->str_length); } string_ty * wstr_to_str(ws) const wstring_ty *ws; { char *text; size_t length; wstr_to_mbs(ws, &text, &length); return str_n_from_c(text, length); }