//
// aegis - project change supervisor
// Copyright (C) 1995, 1996, 1998, 1999, 2002-2006, 2008 Peter Miller
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see
// .
//
//
// A literal pool is maintained. Each string has a reference count. The
// string stays in the literal pool for as long as it has a positive
// reference count. To determine if a string is already in the literal pool,
// linear dynamic hashing is used to guarantee an O(1) search. Making all equal
// strings the same item in the literal pool means that string equality is
// a pointer test, and thus very fast.
//
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
static wstring_ty **hash_table;
static wstr_hash_ty hash_modulus;
static wstr_hash_ty hash_mask;
static wstr_hash_ty hash_load;
#define MAX_HASH_LEN 20
//
// NAME
// hash_generate - hash string to number
//
// SYNOPSIS
// wstr_hash_ty hash_generate(wchar_t *s, size_t n);
//
// DESCRIPTION
// The hash_generate function is used to make a number from a string.
//
// RETURNS
// wstr_hash_ty - the magic number
//
// CAVEAT
// Only the last MAX_HASH_LEN characters are used.
// It is important that wstr_hash_ty be unsigned (int or long).
//
static wstr_hash_ty
hash_generate(const wchar_t *s, size_t n)
{
wstr_hash_ty retval;
if (n > MAX_HASH_LEN)
{
s += n - MAX_HASH_LEN;
n = MAX_HASH_LEN;
}
retval = 0;
while (n > 0)
{
retval = (retval + (retval << 1)) ^ *s++;
--n;
}
return retval;
}
//
// NAME
// wstr_initialize - start up string table
//
// SYNOPSIS
// void wstr_initialize(void);
//
// DESCRIPTION
// The wstr_initialize function is used to create the hash table and
// initialize it to empty.
//
// RETURNS
// void
//
// CAVEAT
// This function must be called before any other defined in this file.
//
static void
wstr_initialize(void)
{
if (hash_modulus)
return;
trace(("%s\n", __PRETTY_FUNCTION__));
hash_modulus = 1 << 8; // MUST be a power of 2
hash_mask = hash_modulus - 1;
hash_load = 0;
hash_table = new wstring_ty * [hash_modulus];
for (wstr_hash_ty j = 0; j < hash_modulus; ++j)
hash_table[j] = 0;
}
void
wstr_release(void)
{
delete[] hash_table;
hash_table = 0;
}
//
// NAME
// split - reduce table loading
//
// SYNOPSIS
// void split(void);
//
// DESCRIPTION
// The split function is used to reduce the load factor on the hash table.
//
// RETURNS
// void
//
// CAVEAT
// A load factor of about 80% is suggested.
//
static void
split(void)
{
trace(("split()\n{\n"));
//
// double the modulus
//
// This is subtle. If we only increase the modulus by one, the
// load always hovers around 80%, so we have to do a split for
// every insert. I.e. the malloc burden os O(n) for the lifetime of
// the program. BUT if we double the modulus, the length of time
// until the next split also doubles, making the probablity of a
// split halve, and sigma(2**-n)=1, so the malloc burden becomes O(1)
// for the lifetime of the program.
//
wstr_hash_ty new_hash_modulus = hash_modulus * 2;
wstring_ty **new_hash_table = new wstring_ty * [new_hash_modulus];
wstr_hash_ty new_hash_mask = new_hash_modulus - 1;
//
// now redistribute the list elements
//
for (wstr_hash_ty idx = 0; idx < hash_modulus; ++idx)
{
new_hash_table[idx] = 0;
new_hash_table[idx + hash_modulus] = 0;
wstring_ty *p = hash_table[idx];
while (p)
{
wstring_ty *p2 = p;
p = p->wstr_next;
assert((p2->wstr_hash & hash_mask) == idx);
wstr_hash_ty new_idx = p2->wstr_hash & new_hash_mask;
p2->wstr_next = new_hash_table[new_idx];
new_hash_table[new_idx] = p2;
}
}
delete [] hash_table;
hash_table = new_hash_table;
hash_modulus = new_hash_modulus;
hash_mask = new_hash_mask;
trace(("}\n"));
}
//
// NAME
// wstr_from_c - make string from C string
//
// SYNOPSIS
// wstring_ty *wstr_from_c(char *);
//
// DESCRIPTION
// The wstr_from_c function is used to make a string from a NUL
// terminated C string. The conversion from multi-byte to wide
// characters is done in the current locale.
//
// RETURNS
// wstring_ty* - a pointer to a string in dynamic memory. Use
// wstr_free when finished with.
//
// CAVEAT
// The contents of the structure pointed to MUST NOT be altered.
//
wstring_ty *
wstr_from_c(const char *s)
{
return wstr_n_from_c(s, strlen(s));
}
//
// NAME
// wstr_from_wc - make string from a wide C string
//
// SYNOPSIS
// wstring_ty *wstr_from_wc(wchar_t *);
//
// DESCRIPTION
// The wstr_from_c function is used to make a string from a NUL
// terminated wide C string.
//
// RETURNS
// wstring_ty* - a pointer to a string in dynamic memory. Use
// wstr_free when finished with.
//
// CAVEAT
// The contents of the structure pointed to MUST NOT be altered.
//
wstring_ty *
wstr_from_wc(const wchar_t *s)
{
return wstr_n_from_wc(s, wcslen(s));
}
//
// NAME
// wstr_n_from_c - make string
//
// SYNOPSIS
// wstring_ty *wstr_n_from_c(char *s, size_t n);
//
// DESCRIPTION
// The wstr_n_from_c function is used to make a string from an
// array of characters. No NUL terminator is assumed. The
// conversion from muti-byte to wide characters is done in the
// current locale.
//
// RETURNS
// wstring_ty* - a pointer to a string in dynamic memory. Use
// wstr_free when finished with.
//
// CAVEAT
// The contents of the structure pointed to MUST NOT be altered.
//
wstring_ty *
wstr_n_from_c(const char *s, size_t length)
{
static wchar_t *buf;
static size_t bufmax;
if (!s && !length)
{
delete [] buf;
buf = 0;
bufmax = 0;
return 0;
}
//
// Do the conversion "long hand". This is because some
// implementations of the mbstowcs function barf when they see
// invalid multi byte character sequences. This function
// renders them as C escape sequences and keeps going.
//
if (bufmax < length)
{
for (;;)
{
bufmax = bufmax * 2 + 8;
if (length <= bufmax)
break;
}
delete [] buf;
// the 4 is the longest escape sequence
buf = new wchar_t [bufmax * sizeof(wchar_t) * 4];
}
//
// change the locale to the native language default
//
language_human();
//
// Reset the mbtowc internal state.
//
mbtowc((wchar_t *)0, (char *)0, 0); //lint !e418
//
// scan the string and extract the wide characters
//
const char *ip = s;
wchar_t *op = buf;
size_t remainder = length;
while (remainder > 0)
{
int n = mbtowc(op, ip, remainder);
if (n == 0)
break;
if (n < 0)
{
//
// Invalid multi byte sequence, replace the
// first character with a C escape sequence.
//
static const char escapes[] = "\aa\bb\ff\nn\rr\tt\vv";
const char *esc = strchr(escapes, *ip);
if (esc)
{
*op++ = '\\';
*op++ = esc[1];
}
else
{
*op++ = '\\';
*op++ = '0' + ((*ip >> 6) & 3);
*op++ = '0' + ((*ip >> 3) & 7);
*op++ = '0' + (*ip & 7);
}
++ip;
--remainder;
//
// The mbtowc function's internal state will now
// be "error" or broken, or otherwise useless.
// Reset it so that we can keep going.
//
mbtowc((wchar_t *)0, (char *)0, 0); //lint !e418
}
else
{
//
// the one wchar_t used n chars
//
ip += n;
remainder -= n;
++op;
}
}
//
// change the locale back to the C locale
//
language_C();
//
// build the result from the image in "buf"
//
return wstr_n_from_wc(buf, op - buf);
}
//
// NAME
// wstr_to_mbs - wide string to multi-byte C string
//
// SYNOPSIS
// void wstr_to_mbs(wstring_ty *s, char **rslt, size_t *rslt_len);
//
// DESCRIPTION
// The wstr_to_mbs function convers a wide character string into a
// multi-byte C string. The conversion is done in the current
// locale. The result is NUL terminated, however the result length
// does not include the NUL.
//
// CAVEAT
// DO NOT free the result. The result will change between calls,
// so copy it if you need to keep it.
//
void
wstr_to_mbs(const wstring_ty *s, char **result_p, size_t *result_length_p)
{
static char *buf;
static size_t bufmax;
int n;
const wchar_t *ip;
size_t remainder;
char *op;
size_t buflen;
trace(("%s\n", __PRETTY_FUNCTION__));
//
// For reasons I don't understand, the MB_CUR_MAX symbol (wich is
// defined as a reference to __mb_cur_max) does not get resolved
// at link time, despite being present in libc. It's easier to
// just dodge the question.
//
#ifdef __CYGWIN__
#undef MB_CUR_MAX
#define MB_CUR_MAX 8
#endif
//
// Do the conversion "long hand". This is because the wcstombs
// function barfs when it sees an invalid wchar_t. This
// function treats them literally and keeps going.
//
buflen = (s->wstr_length + 1) * MB_CUR_MAX;
if (buflen < s->wstr_length + 1)
{
//
// There are some wonderfully brain-dead implementations
// out there. The more stupid ones manage to make
// MB_CUR_MAX be zero!
//
buflen = s->wstr_length + 1;
}
if (buflen > bufmax)
{
for (;;)
{
bufmax = bufmax * 2 + 8;
if (buflen <= bufmax)
break;
}
delete [] buf;
buf = new char [bufmax];
}
//
// perform the conversion in the native language default
//
language_human();
//
// The wctomb function has internal state. It needs to be reset.
//
wctomb((char *)0, 0); //lint !e418
ip = s->wstr_text;
remainder = s->wstr_length;
op = buf;
while (remainder > 0)
{
n = wctomb(op, *ip);
if (n <= 0)
{
//
// Copy the character literally.
// Throw away anything that will not fit.
//
*op++ = *ip++;
if (!op[-1])
op[-1] = '?';
--remainder;
//
// The wctomb function's internal state will now
// be "error" or broken, or otherwise useless.
// Reset it so that we can keep going.
//
wctomb((char *)0, 0); //lint !e418
}
else
{
op += n;
++ip;
--remainder;
}
}
//
// The final NUL could require shift state end characters,
// meaning that n could be more than 1.
//
n = wctomb(op, (wchar_t)0);
if (n <= 0)
*op = 0;
else
{
op += n - 1;
assert(*op == 0);
}
//
// restore the locale to the C locale
//
language_C();
//
// set the output side effects
//
*result_p = buf;
*result_length_p = op - buf;
}
//
// NAME
// wstr_n_from_wc - make string
//
// SYNOPSIS
// wstring_ty *wstr_n_from_wc(wchar_t *s, size_t n);
//
// DESCRIPTION
// The wstr_n_from_c function is used to make a string from an
// array of wide characters. No NUL terminator is assumed.
//
// RETURNS
// wstring_ty* - a pointer to a string in dynamic memory. Use
// wstr_free when finished with.
//
// CAVEAT
// The contents of the structure pointed to MUST NOT be altered.
//
wstring_ty *
wstr_n_from_wc(const wchar_t *s, size_t length)
{
trace(("%s\n", __PRETTY_FUNCTION__));
wstring_ty *p;
if (!hash_modulus)
wstr_initialize();
wstr_hash_ty hash = hash_generate(s, length);
wstr_hash_ty idx = hash & hash_mask;
for (p = hash_table[idx]; p; p = p->wstr_next)
{
if
(
p->wstr_hash == hash
&&
p->wstr_length == length
&&
!memcmp(p->wstr_text, s, length * sizeof(wchar_t))
)
{
p->wstr_references++;
return p;
}
}
p = (wstring_ty *)mem_alloc(sizeof(wstring_ty) + length * sizeof(wchar_t));
p->wstr_hash = hash;
p->wstr_length = length;
p->wstr_references = 1;
p->wstr_next = hash_table[idx];
hash_table[idx] = p;
memcpy(p->wstr_text, s, length * sizeof(wchar_t));
p->wstr_text[length] = 0;
hash_load++;
while (hash_load * 10 > hash_modulus * 8)
split();
return p;
}
//
// NAME
// wstr_copy - make a copy of a string
//
// SYNOPSIS
// wstring_ty *wstr_copy(wstring_ty *s);
//
// DESCRIPTION
// The wstr_copy function is used to make a copy of a string.
//
// RETURNS
// wstring_ty* - a pointer to a string in dynamic memory.
// Use wstr_free when finished with.
//
// CAVEAT
// The contents of the structure pointed to MUST NOT be altered.
//
wstring_ty *
wstr_copy(wstring_ty *s)
{
s->wstr_references++;
return s;
}
//
// NAME
// wstr_free - release a string
//
// SYNOPSIS
// void wstr_free(wstring_ty *s);
//
// DESCRIPTION
// The wstr_free function is used to indicate that a string hash been
// finished with.
//
// RETURNS
// void
//
// CAVEAT
// This is the only way to release strings DO NOT use the free function.
//
void
wstr_free(wstring_ty *s)
{
wstring_ty **spp;
if (!s)
return;
if (s->wstr_references > 1)
{
s->wstr_references--;
return;
}
//
// find the hash bucket it was in,
// and remove it
//
wstr_hash_ty idx = s->wstr_hash & hash_mask;
for (spp = &hash_table[idx]; *spp; spp = &(*spp)->wstr_next)
{
if (*spp == s)
{
*spp = s->wstr_next;
mem_free(s);
--hash_load;
return;
}
}
//
// should never reach here!
//
fatal_raw("attempted to free non-existent wstring (bug)");
}
//
// NAME
// wstr_catenate - join two strings
//
// SYNOPSIS
// wstring_ty *wstr_catenate(wstring_ty *, wstring_ty *);
//
// DESCRIPTION
// The wstr_catenate function is used to concatenate two strings to form a
// new string.
//
// RETURNS
// wstring_ty* - a pointer to a string in dynamic memory.
// Use wstr_free when finished with.
//
// CAVEAT
// The contents of the structure pointed to MUST NOT be altered.
//
wstring_ty *
wstr_catenate(const wstring_ty *s1, const wstring_ty *s2)
{
static wchar_t *tmp;
static size_t tmplen;
wstring_ty *s;
size_t length;
length = s1->wstr_length + s2->wstr_length;
if (length > tmplen)
{
for (;;)
{
tmplen = tmplen * 2 + 8;
if (length <= tmplen)
break;
}
delete [] tmp;
tmp = new wchar_t [tmplen];
}
memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t));
memcpy
(
tmp + s1->wstr_length,
s2->wstr_text,
s2->wstr_length * sizeof(wchar_t)
);
s = wstr_n_from_wc(tmp, length);
return s;
}
//
// NAME
// wstr_cat_three - join three strings
//
// SYNOPSIS
// wstring_ty *wstr_cat_three(wstring_ty *, wstring_ty *, wstring_ty *);
//
// DESCRIPTION
// The wstr_cat_three function is used to concatenate three strings to form
// a new string.
//
// RETURNS
// wstring_ty* - a pointer to a string in dynamic memory.
// Use wstr_free when finished with.
//
// CAVEAT
// The contents of the structure pointed to MUST NOT be altered.
//
wstring_ty *
wstr_cat_three(const wstring_ty *s1, const wstring_ty *s2, const wstring_ty *s3)
{
static wchar_t *tmp;
static size_t tmplen;
wstring_ty *s;
size_t length;
length = s1->wstr_length + s2->wstr_length + s3->wstr_length;
if (tmplen < length)
{
for (;;)
{
tmplen = tmplen * 2 + 8;
if (length <= tmplen)
break;
}
delete [] tmp;
tmp = new wchar_t [tmplen];
}
memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t));
memcpy
(
tmp + s1->wstr_length,
s2->wstr_text,
s2->wstr_length * sizeof(wchar_t)
);
memcpy
(
tmp + s1->wstr_length + s2->wstr_length,
s3->wstr_text,
s3->wstr_length * sizeof(wchar_t)
);
s = wstr_n_from_wc(tmp, length);
return s;
}
wstring_ty *
wstr_capitalize(const wstring_ty *wsp)
{
static wchar_t *buffer;
static size_t buflen;
size_t j;
int prev_was_alpha;
if (wsp->wstr_length > buflen)
{
for (;;)
{
buflen = buflen * 2 + 8;
if (wsp->wstr_length <= buflen)
break;
}
delete [] buffer;
buffer = new wchar_t [buflen];
}
language_human();
prev_was_alpha = 0;
for (j = 0; j < wsp->wstr_length; ++j)
{
wchar_t c;
c = wsp->wstr_text[j];
if (iswlower(c))
{
if (!prev_was_alpha)
c = towupper(c);
prev_was_alpha = 1;
}
else if (iswupper(c))
{
if (prev_was_alpha)
c = towlower(c);
prev_was_alpha = 1;
}
else
prev_was_alpha = 0;
buffer[j] = c;
}
language_C();
return wstr_n_from_wc(buffer, wsp->wstr_length);
}
wstring_ty *
wstr_to_upper(const wstring_ty *wsp)
{
static wchar_t *buffer;
static size_t buflen;
size_t j;
if (wsp->wstr_length > buflen)
{
for (;;)
{
buflen = buflen * 2 + 8;
if (wsp->wstr_length <= buflen)
break;
}
delete [] buffer;
buffer = new wchar_t [buflen];
}
language_human();
for (j = 0; j < wsp->wstr_length; ++j)
{
wchar_t c;
c = wsp->wstr_text[j];
if (iswlower(c))
c = towupper(c);
buffer[j] = c;
}
language_C();
return wstr_n_from_wc(buffer, wsp->wstr_length);
}
wstring_ty *
wstr_to_lower(const wstring_ty *wsp)
{
static wchar_t *buffer;
static size_t buflen;
size_t j;
if (wsp->wstr_length > buflen)
{
for (;;)
{
buflen = buflen * 2 + 8;
if (wsp->wstr_length <= buflen)
break;
}
delete [] buffer;
buffer = new wchar_t [buflen];
}
language_human();
for (j = 0; j < wsp->wstr_length; ++j)
{
wchar_t c;
c = wsp->wstr_text[j];
if (iswupper(c))
c = towlower(c);
buffer[j] = c;
}
language_C();
return wstr_n_from_wc(buffer, wsp->wstr_length);
}
wstring_ty *
wstr_to_ident(const wstring_ty *wsp)
{
static wchar_t *buffer;
static size_t buflen;
size_t j;
if (wsp->wstr_length == 0)
return wstr_from_c("_");
if (wsp->wstr_length > buflen)
{
for (;;)
{
buflen = buflen * 2 + 8;
if (wsp->wstr_length <= buflen)
break;
}
delete [] buffer;
buffer = new wchar_t [buflen];
}
language_human();
for (j = 0; j < wsp->wstr_length; ++j)
{
wchar_t c;
c = wsp->wstr_text[j];
if (!iswalnum(c))
c = '_';
buffer[j] = c;
}
if (iswdigit(buffer[0]))
buffer[0] = '_';
language_C();
return wstr_n_from_wc(buffer, wsp->wstr_length);
}
//
// NAME
// wstr_equal - test equality of strings
//
// SYNOPSIS
// int wstr_equal(wstring_ty *, wstring_ty *);
//
// DESCRIPTION
// The wstr_equal function is used to test if two strings are equal.
//
// RETURNS
// int; zero if the strings are not equal, nonzero if the strings are
// equal.
//
// CAVEAT
// This function is implemented as a macro in strings.h
//
#ifndef wstr_equal
int
wstr_equal(const wstring_ty *s1, const wstring_ty *s2)
{
return (s1 == s2);
}
#endif
wstring_ty *
str_to_wstr(const string_ty *s)
{
return wstr_n_from_c(s->str_text, s->str_length);
}
string_ty *
wstr_to_str(const wstring_ty *wsp)
{
char *text;
size_t length;
wstr_to_mbs(wsp, &text, &length);
return str_n_from_c(text, length);
}