//
// aegis - project change supervisor
// Copyright (C) 1995-2014 Peter Miller
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published
// by the Free Software Foundation; either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
// //
// A literal pool is maintained.  Each string has a reference count.  The
// string stays in the literal pool for as long as it has a positive
// reference count.  To determine if a string is already in the literal pool,
// linear dynamic hashing is used to guarantee an O(1) search.  Making all equal
// strings the same item in the literal pool means that string equality is
// a pointer test, and thus very fast.
//

#include <common/ac/assert.h>
#include <common/ac/stddef.h>
#include <common/ac/stdio.h>
#include <common/ac/stdlib.h>
#include <common/ac/string.h>
#include <common/ac/stdarg.h>
#include <common/ac/wchar.h>
#include <common/ac/wctype.h>

#include <common/error.h>
#include <common/language.h>
#include <common/mem.h>
#include <common/multibyte.h>
#include <common/str.h>
#include <common/trace.h>
#include <common/wstr.h>


static wstring_ty **hash_table;
static wstr_hash_ty hash_modulus;
static wstr_hash_ty hash_mask;
static wstr_hash_ty hash_load;

#define MAX_HASH_LEN 20


//
// NAME
//      hash_generate - hash string to number
//
// SYNOPSIS
//      wstr_hash_ty hash_generate(wchar_t *s, size_t n);
//
// DESCRIPTION
//      The hash_generate function is used to make a number from a string.
//
// RETURNS
//      wstr_hash_ty - the magic number
//
// CAVEAT
//      Only the last MAX_HASH_LEN characters are used.
//      It is important that wstr_hash_ty be unsigned (int or long).
//

static wstr_hash_ty
hash_generate(const wchar_t *s, size_t n)
{
    wstr_hash_ty    retval;

    if (n > MAX_HASH_LEN)
    {
        s += n - MAX_HASH_LEN;
        n = MAX_HASH_LEN;
    }

    retval = 0;
    while (n > 0)
    {
        retval = (retval + (retval << 1)) ^ *s++;
        --n;
    }
    return retval;
}


//
// NAME
//      wstr_initialize - start up string table
//
// SYNOPSIS
//      void wstr_initialize(void);
//
// DESCRIPTION
//      The wstr_initialize function is used to create the hash table and
//      initialize it to empty.
//
// RETURNS
//      void
//
// CAVEAT
//      This function must be called before any other defined in this file.
//

static void
wstr_initialize(void)
{
    if (hash_modulus)
        return;
    trace(("%s\n", __PRETTY_FUNCTION__));
    hash_modulus = 1 << 8;      // MUST be a power of 2
    hash_mask = hash_modulus - 1;
    hash_load = 0;
    hash_table = new wstring_ty * [hash_modulus];
    for (wstr_hash_ty j = 0; j < hash_modulus; ++j)
        hash_table[j] = 0;
}

void
wstr_release(void)
{
    delete[] hash_table;
    hash_table = 0;
}

//
// NAME
//      split - reduce table loading
//
// SYNOPSIS
//      void split(void);
//
// DESCRIPTION
//      The split function is used to reduce the load factor on the hash table.
//
// RETURNS
//      void
//
// CAVEAT
//      A load factor of about 80% is suggested.
//

static void
split(void)
{
    trace(("split()\n{\n"));
    //
    // double the modulus
    //
    // This is subtle.  If we only increase the modulus by one, the
    // load always hovers around 80%, so we have to do a split for
    // every insert.  I.e. the malloc burden os O(n) for the lifetime of
    // the program.  BUT if we double the modulus, the length of time
    // until the next split also doubles, making the probablity of a
    // split halve, and sigma(2**-n)=1, so the malloc burden becomes O(1)
    // for the lifetime of the program.
    //
    wstr_hash_ty new_hash_modulus = hash_modulus * 2;
    wstring_ty **new_hash_table = new wstring_ty * [new_hash_modulus];
    wstr_hash_ty new_hash_mask = new_hash_modulus - 1;

    //
    // now redistribute the list elements
    //
    for (wstr_hash_ty idx = 0; idx < hash_modulus; ++idx)
    {
        new_hash_table[idx] = 0;
        new_hash_table[idx + hash_modulus] = 0;
        wstring_ty *p = hash_table[idx];
        while (p)
        {
            wstring_ty *p2 = p;
            p = p->wstr_next;
            assert((p2->wstr_hash & hash_mask) == idx);
            wstr_hash_ty new_idx = p2->wstr_hash & new_hash_mask;
            p2->wstr_next = new_hash_table[new_idx];
            new_hash_table[new_idx] = p2;
        }
    }
    delete [] hash_table;
    hash_table = new_hash_table;
    hash_modulus = new_hash_modulus;
    hash_mask = new_hash_mask;
    trace(("}\n"));
}


//
// NAME
//      wstr_from_c - make string from C string
//
// SYNOPSIS
//      wstring_ty *wstr_from_c(char *);
//
// DESCRIPTION
//      The wstr_from_c function is used to make a string from a NUL
//      terminated C string.  The conversion from multi-byte to wide
//      characters is done in the current locale.
//
// RETURNS
//      wstring_ty* - a pointer to a string in dynamic memory.  Use
//      wstr_free when finished with.
//
// CAVEAT
//      The contents of the structure pointed to MUST NOT be altered.
//

wstring_ty *
wstr_from_c(const char *s)
{
    return wstr_n_from_c(s, strlen(s));
}


//
// NAME
//      wstr_from_wc - make string from a wide C string
//
// SYNOPSIS
//      wstring_ty *wstr_from_wc(wchar_t *);
//
// DESCRIPTION
//      The wstr_from_c function is used to make a string from a NUL
//      terminated wide C string.
//
// RETURNS
//      wstring_ty* - a pointer to a string in dynamic memory.  Use
//      wstr_free when finished with.
//
// CAVEAT
//      The contents of the structure pointed to MUST NOT be altered.
//

wstring_ty *
wstr_from_wc(const wchar_t *s)
{
    return wstr_n_from_wc(s, wcslen(s));
}


//
// NAME
//      wstr_n_from_c - make string
//
// SYNOPSIS
//      wstring_ty *wstr_n_from_c(char *s, size_t n);
//
// DESCRIPTION
//      The wstr_n_from_c function is used to make a string from an
//      array of characters.  No NUL terminator is assumed.  The
//      conversion from muti-byte to wide characters is done in the
//      current locale.
//
// RETURNS
//      wstring_ty* - a pointer to a string in dynamic memory.  Use
//      wstr_free when finished with.
//
// CAVEAT
//      The contents of the structure pointed to MUST NOT be altered.
//

wstring_ty *
wstr_n_from_c(const char *s, size_t length)
{
    static wchar_t *buf;
    static size_t bufmax;

    if (!s && !length)
    {
        delete [] buf;
        buf = 0;
        bufmax = 0;
        return 0;
    }
    //
    // Do the conversion "long hand".  This is because some
    // implementations of the mbstowcs function barf when they see
    // invalid multi byte character sequences.  This function
    // renders them as C escape sequences and keeps going.
    //
    if (bufmax < length)
    {
        for (;;)
        {
            bufmax = bufmax * 2 + 8;
            if (length <= bufmax)
                break;
        }
        delete [] buf;
        // the 4 is the longest escape sequence
        buf = new wchar_t [bufmax * sizeof(wchar_t) * 4];
    }

    //
    // change the locale to the native language default
    //
    language_human();

    //
    // Reset the mbtowc internal state.
    //
    mbtowc((wchar_t *)0, (char *)0, 0);                         //lint !e418

    //
    // scan the string and extract the wide characters
    //
    const char *ip = s;
    wchar_t *op = buf;
    size_t remainder = length;
    while (remainder > 0)
    {
        int n = mbtowc(op, ip, remainder);
        if (n == 0)
            break;
        if (n < 0)
        {
            //
            // Invalid multi byte sequence, replace the
            // first character with a C escape sequence.
            //
            static const char escapes[] = "\aa\bb\ff\nn\rr\tt\vv";
            const char *esc = strchr(escapes, *ip);
            if (esc)
            {
                *op++ = '\\';
                *op++ = esc[1];
            }
            else
            {
                *op++ = '\\';
                *op++ = '0' + ((*ip >> 6) & 3);
                *op++ = '0' + ((*ip >> 3) & 7);
                *op++ = '0' + (*ip & 7);
            }
            ++ip;
            --remainder;

            //
            // The mbtowc function's internal state will now
            // be "error" or broken, or otherwise useless.
            // Reset it so that we can keep going.
            //
            mbtowc((wchar_t *)0, (char *)0, 0);                 //lint !e418
        }
        else
        {
            //
            // the one wchar_t used n chars
            //
            ip += n;
            remainder -= n;
            ++op;
        }
    }

    //
    // change the locale back to the C locale
    //
    language_C();

    //
    // build the result from the image in "buf"
    //
    return wstr_n_from_wc(buf, op - buf);
}


void
wstr_to_mbs(const wstring_ty *s, char **result_p, size_t *result_size_p)
{
    wcs_to_mbs(s->wstr_text, s->wstr_length, result_p, result_size_p);
}


//
// NAME
//      wstr_n_from_wc - make string
//
// SYNOPSIS
//      wstring_ty *wstr_n_from_wc(wchar_t *s, size_t n);
//
// DESCRIPTION
//      The wstr_n_from_c function is used to make a string from an
//      array of wide characters.  No NUL terminator is assumed.
//
// RETURNS
//      wstring_ty* - a pointer to a string in dynamic memory.  Use
//      wstr_free when finished with.
//
// CAVEAT
//      The contents of the structure pointed to MUST NOT be altered.
//

wstring_ty *
wstr_n_from_wc(const wchar_t *s, size_t length)
{
    trace(("%s\n", __PRETTY_FUNCTION__));
    wstring_ty *p;

    if (!hash_modulus)
        wstr_initialize();
    wstr_hash_ty hash = hash_generate(s, length);

    wstr_hash_ty idx = hash & hash_mask;
    for (p = hash_table[idx]; p; p = p->wstr_next)
    {
        if
        (
            p->wstr_hash == hash
        &&
            p->wstr_length == length
        &&
            !memcmp(p->wstr_text, s, length * sizeof(wchar_t))
        )
        {
            p->wstr_references++;
            return p;
        }
    }

    p = (wstring_ty *)mem_alloc(sizeof(wstring_ty) + length * sizeof(wchar_t));
    p->wstr_hash = hash;
    p->wstr_length = length;
    p->wstr_references = 1;
    p->wstr_next = hash_table[idx];
    hash_table[idx] = p;
    memcpy(p->wstr_text, s, length * sizeof(wchar_t));
    p->wstr_text[length] = 0;

    hash_load++;
    while (hash_load * 10 > hash_modulus * 8)
        split();
    return p;
}


//
// NAME
//      wstr_copy - make a copy of a string
//
// SYNOPSIS
//      wstring_ty *wstr_copy(wstring_ty *s);
//
// DESCRIPTION
//      The wstr_copy function is used to make a copy of a string.
//
// RETURNS
//      wstring_ty* - a pointer to a string in dynamic memory.
//      Use wstr_free when finished with.
//
// CAVEAT
//      The contents of the structure pointed to MUST NOT be altered.
//

wstring_ty *
wstr_copy(wstring_ty *s)
{
    s->wstr_references++;
    return s;
}


//
// NAME
//      wstr_free - release a string
//
// SYNOPSIS
//      void wstr_free(wstring_ty *s);
//
// DESCRIPTION
//      The wstr_free function is used to indicate that a string hash been
//      finished with.
//
// RETURNS
//      void
//
// CAVEAT
//      This is the only way to release strings DO NOT use the free function.
//

void
wstr_free(wstring_ty *s)
{
    wstring_ty **spp;

    if (!s)
        return;
    if (s->wstr_references > 1)
    {
        s->wstr_references--;
        return;
    }

    //
    // find the hash bucket it was in,
    // and remove it
    //
    wstr_hash_ty idx = s->wstr_hash & hash_mask;
    for (spp = &hash_table[idx]; *spp; spp = &(*spp)->wstr_next)
    {
        if (*spp == s)
        {
            *spp = s->wstr_next;
            mem_free(s);
            --hash_load;
            return;
        }
    }

    //
    // should never reach here!
    //
    assert(!"attempted to free non-existent wstring (bug)");
    fatal_raw("attempted to free non-existent wstring (bug)");
}


//
// NAME
//      wstr_catenate - join two strings
//
// SYNOPSIS
//      wstring_ty *wstr_catenate(wstring_ty *, wstring_ty *);
//
// DESCRIPTION
//      The wstr_catenate function is used to concatenate two strings to form a
//      new string.
//
// RETURNS
//      wstring_ty* - a pointer to a string in dynamic memory.
//      Use wstr_free when finished with.
//
// CAVEAT
//      The contents of the structure pointed to MUST NOT be altered.
//

wstring_ty *
wstr_catenate(const wstring_ty *s1, const wstring_ty *s2)
{
    static wchar_t  *tmp;
    static size_t   tmplen;
    wstring_ty      *s;
    size_t          length;

    length = s1->wstr_length + s2->wstr_length;
    if (length > tmplen)
    {
        for (;;)
        {
            tmplen = tmplen * 2 + 8;
            if (length <= tmplen)
                break;
        }
        delete [] tmp;
        tmp = new wchar_t [tmplen];
    }
    memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t));
    memcpy
    (
        tmp + s1->wstr_length,
        s2->wstr_text,
        s2->wstr_length * sizeof(wchar_t)
    );
    s = wstr_n_from_wc(tmp, length);
    return s;
}


//
// NAME
//      wstr_cat_three - join three strings
//
// SYNOPSIS
//      wstring_ty *wstr_cat_three(wstring_ty *, wstring_ty *, wstring_ty *);
//
// DESCRIPTION
//      The wstr_cat_three function is used to concatenate three strings to form
//      a new string.
//
// RETURNS
//      wstring_ty* - a pointer to a string in dynamic memory.
//      Use wstr_free when finished with.
//
// CAVEAT
//      The contents of the structure pointed to MUST NOT be altered.
//

wstring_ty *
wstr_cat_three(const wstring_ty *s1, const wstring_ty *s2, const wstring_ty *s3)
{
    static wchar_t  *tmp;
    static size_t   tmplen;
    wstring_ty      *s;
    size_t          length;

    length = s1->wstr_length + s2->wstr_length + s3->wstr_length;
    if (tmplen < length)
    {
        for (;;)
        {
            tmplen = tmplen * 2 + 8;
            if (length <= tmplen)
                break;
        }
        delete [] tmp;
        tmp = new wchar_t [tmplen];
    }
    memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t));
    memcpy
    (
        tmp + s1->wstr_length,
        s2->wstr_text,
        s2->wstr_length * sizeof(wchar_t)
    );
    memcpy
    (
        tmp + s1->wstr_length + s2->wstr_length,
        s3->wstr_text,
        s3->wstr_length * sizeof(wchar_t)
    );
    s = wstr_n_from_wc(tmp, length);
    return s;
}


wstring_ty *
wstr_capitalize(const wstring_ty *wsp)
{
    static wchar_t  *buffer;
    static size_t   buflen;
    size_t          j;
    int             prev_was_alpha;

    if (wsp->wstr_length > buflen)
    {
        for (;;)
        {
            buflen = buflen * 2 + 8;
            if (wsp->wstr_length <= buflen)
                break;
        }
        delete [] buffer;
        buffer = new wchar_t [buflen];
    }
    language_human();
    prev_was_alpha = 0;
    for (j = 0; j < wsp->wstr_length; ++j)
    {
        wchar_t         c;

        c = wsp->wstr_text[j];
        if (iswlower(c))
        {
            if (!prev_was_alpha)
                c = towupper(c);
            prev_was_alpha = 1;
        }
        else if (iswupper(c))
        {
            if (prev_was_alpha)
                c = towlower(c);
            prev_was_alpha = 1;
        }
        else
            prev_was_alpha = 0;
        buffer[j] = c;
    }
    language_C();
    return wstr_n_from_wc(buffer, wsp->wstr_length);
}


wstring_ty *
wstr_to_upper(const wstring_ty *wsp)
{
    static wchar_t  *buffer;
    static size_t   buflen;
    size_t          j;

    if (wsp->wstr_length > buflen)
    {
        for (;;)
        {
            buflen = buflen * 2 + 8;
            if (wsp->wstr_length <= buflen)
                break;
        }
        delete [] buffer;
        buffer = new wchar_t [buflen];
    }
    language_human();
    for (j = 0; j < wsp->wstr_length; ++j)
    {
        wchar_t         c;

        c = wsp->wstr_text[j];
        if (iswlower(c))
            c = towupper(c);
        buffer[j] = c;
    }
    language_C();
    return wstr_n_from_wc(buffer, wsp->wstr_length);
}


wstring_ty *
wstr_to_lower(const wstring_ty *wsp)
{
    static wchar_t  *buffer;
    static size_t   buflen;
    size_t          j;

    if (wsp->wstr_length > buflen)
    {
        for (;;)
        {
            buflen = buflen * 2 + 8;
            if (wsp->wstr_length <= buflen)
                break;
        }
        delete [] buffer;
        buffer = new wchar_t [buflen];
    }
    language_human();
    for (j = 0; j < wsp->wstr_length; ++j)
    {
        wchar_t         c;

        c = wsp->wstr_text[j];
        if (iswupper(c))
            c = towlower(c);
        buffer[j] = c;
    }
    language_C();
    return wstr_n_from_wc(buffer, wsp->wstr_length);
}


wstring_ty *
wstr_to_ident(const wstring_ty *wsp)
{
    static wchar_t  *buffer;
    static size_t   buflen;
    size_t          j;

    if (wsp->wstr_length == 0)
        return wstr_from_c("_");
    if (wsp->wstr_length > buflen)
    {
        for (;;)
        {
            buflen = buflen * 2 + 8;
            if (wsp->wstr_length <= buflen)
                break;
        }
        delete [] buffer;
        buffer = new wchar_t [buflen];
    }
    language_human();
    for (j = 0; j < wsp->wstr_length; ++j)
    {
        wchar_t         c;

        c = wsp->wstr_text[j];
        if (!iswalnum(c))
            c = '_';
        buffer[j] = c;
    }
    if (iswdigit(buffer[0]))
        buffer[0] = '_';
    language_C();
    return wstr_n_from_wc(buffer, wsp->wstr_length);
}


//
// NAME
//      wstr_equal - test equality of strings
//
// SYNOPSIS
//      int wstr_equal(wstring_ty *, wstring_ty *);
//
// DESCRIPTION
//      The wstr_equal function is used to test if two strings are equal.
//
// RETURNS
//      int; zero if the strings are not equal, nonzero if the strings are
//      equal.
//
// CAVEAT
//      This function is implemented as a macro in strings.h
//

#ifndef wstr_equal

int
wstr_equal(const wstring_ty *s1, const wstring_ty *s2)
{
    return (s1 == s2);
}

#endif


wstring_ty *
str_to_wstr(const string_ty *s)
{
    return wstr_n_from_c(s->str_text, s->str_length);
}


string_ty *
wstr_to_str(const wstring_ty *wsp)
{
    char            *text;
    size_t          length;

    wstr_to_mbs(wsp, &text, &length);
    return str_n_from_c(text, length);
}


// vim: set ts=8 sw=4 et :