Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members

GenCaseFoldingCompare.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002 * Copyright (C) 2004 Vintela, Inc. All rights reserved.
00003 * Copyright (C) 2005 Novell, Inc. All rights reserved.
00004 *
00005 * Redistribution and use in source and binary forms, with or without
00006 * modification, are permitted provided that the following conditions are met:
00007 *
00008 *  - Redistributions of source code must retain the above copyright notice,
00009 *    this list of conditions and the following disclaimer.
00010 *
00011 *  - Redistributions in binary form must reproduce the above copyright notice,
00012 *    this list of conditions and the following disclaimer in the documentation
00013 *    and/or other materials provided with the distribution.
00014 *
00015 *  - Neither the name of Vintela, Inc., Novell, Inc., nor the names of its
00016 *    contributors may be used to endorse or promote products derived from this
00017 *    software without specific prior written permission.
00018 *
00019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
00020 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00021 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00022 * ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc., Novell, Inc., OR THE
00023 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00024 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00025 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
00026 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00027 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
00028 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
00029 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030 *******************************************************************************/
00035 // The source of the Unicode data is: http://www.unicode.org/Public/UNIDATA/
00036 
00037 #include "String.hpp"
00038 #include "Array.hpp"
00039 #include "StringStream.hpp"
00040 #include "UTF8Utils.hpp"
00041 #include <fstream>
00042 #include <cctype> // for isxdigit
00043 #include <algorithm>
00044 #include <vector>
00045 #include <iostream>
00046 #include <cassert>
00047 #include <map>
00048 #include <set>
00049 
00050 using namespace std;
00051 using namespace BLOCXX_NAMESPACE;
00052 
00053 #if 1
00054 #define DEBUG(x) cout << x
00055 #else
00056 #define DEBUG(x)
00057 #endif
00058 
00059 class StateMachine
00060 {
00061 public:
00062    static const int start = 0;
00063    static const int invalid = -1;
00064 
00065    StateMachine()
00066    {
00067       // create the start state
00068       m_states.resize(1);
00069    }
00070    int getTransition(int state, UInt8 input) const
00071    {
00072       assert(m_states.size() > state);
00073       return m_states[state].transitions[input].toState;
00074    }
00075 
00076    int getStateStr(int state, UInt8 input) const
00077    {
00078       assert(m_states.size() > state);
00079       return m_states[state].transitions[input].inputSelection;
00080    }
00081 
00082    int addState()
00083    {
00084       m_states.resize(m_states.size() + 1);
00085       int state = m_states.size() - 1;
00086       m_states[state].acceptState = false;
00087       return state;
00088    }
00089 
00090    void setStateAccept(int state)
00091    {
00092       assert(m_states.size() > state);
00093       m_states[state].acceptState = true;
00094    }
00095 
00096    // val.inputSelection == 1, increment the pointer to str1
00097    // val.inputSelection == 2, increment the pointer to str2
00098    void addTransition(int transitionsState, UInt8 input, int nextState, int inputSelection)
00099    {
00100       assert(m_states.size() > transitionsState);
00101       assert(m_states.size() > nextState);
00102       assert(inputSelection == 1 || inputSelection == 2);
00103       assert(m_states[transitionsState].transitions[input].toState == invalid);
00104       assert(m_states[transitionsState].transitions[input].inputSelection == -1);
00105       m_states[transitionsState].transitions[input].toState = nextState;
00106       m_states[transitionsState].transitions[input].inputSelection = inputSelection;
00107    }
00108 
00109    void debug() const
00110    {
00111       DEBUG("# states = " << m_states.size() << '\n');
00112    }
00113 
00114    void removeDuplicateState(int state1, int state2)
00115    {
00116       assert(state1 < state2);
00117       m_states.erase(m_states.begin() + state2);
00118       for (int i = 0; i < m_states.size(); ++i)
00119       {
00120          for (int j = 0; j < m_states[i].transitions.size(); ++j)
00121          {
00122             if (m_states[i].transitions[j].toState == state2)
00123             {
00124                m_states[i].transitions[j].toState = state1;
00125             }
00126             else if (m_states[i].transitions[j].toState > state2)
00127             {
00128                // we removed a state, so we just decrement it by one.
00129                --m_states[i].transitions[j].toState;
00130             }
00131          }
00132       }
00133    }
00134    
00135    // transitions, and whether state applies to str1 or str2
00136    struct transition_t
00137    {
00138       transition_t()
00139       : inputSelection(-1)
00140       , toState(invalid)
00141       {
00142       }
00143       int inputSelection;
00144       int toState;
00145 
00146       friend bool operator==(const transition_t& x, const transition_t& y)
00147       {
00148          return (x.inputSelection == y.inputSelection) && (x.toState == y.toState);
00149       }
00150    };
00151 
00152    struct state_t
00153    {
00154       state_t() 
00155       : transitions()
00156       , acceptState(false)
00157       {
00158          transitions.resize(256);
00159       }
00160       vector<transition_t> transitions;
00161       bool acceptState;
00162 
00163       friend bool operator==(const state_t& x, const state_t& y)
00164       {
00165          return (x.transitions == y.transitions) && (x.acceptState == y.acceptState);
00166       }
00167    };
00168    vector<state_t> m_states;
00169 };
00170 
00171 StateMachine stateMachine;
00172 typedef std::multimap<String, String> mmap_t;
00173 typedef mmap_t::const_iterator ci_t;
00174 std::multimap<String, String> caseFoldingEntries;
00175 
00176 int followOrAddTransition(int curTransition, UInt8 input, int aux)
00177 {
00178    int nextTransition = stateMachine.getTransition(curTransition, input);
00179    if (nextTransition == StateMachine::invalid)
00180    {
00181       // no transition, add one
00182       int newState = stateMachine.addState();
00183       DEBUG("added new state " << newState);
00184       //stateMachine.setInputSelection(curTransition, input, aux);
00185       stateMachine.addTransition(curTransition, input, newState, aux);
00186       DEBUG(" and transition from " << curTransition << " to " << newState << " on input " << aux << " = \\x" << int(input) << endl);
00187       nextTransition = newState;
00188    }
00189    else
00190    {
00191       if (stateMachine.getStateStr(curTransition, input) == -1)
00192       {
00193          DEBUG("?? stateMachine.getStateStr(curTransition, input) == -1 ??" << endl);
00194          //stateMachine.setInputSelection(curTransition, input, aux);
00195       }
00196       DEBUG("transition already exists.  curTransition = " << curTransition << " aux = " << aux << " getStateStr = " << stateMachine.getStateStr(curTransition, input) << endl);
00197       // found a transition
00198       assert(stateMachine.getStateStr(curTransition, input) == aux);
00199    }
00200    return nextTransition;
00201 }
00202 
00203 void printStrings(const String& str1, const String& str2)
00204 {
00205    DEBUG("buildTransitions: str1 = \"");
00206    for (int i = 0; i < str1.length(); ++i)
00207    {
00208       DEBUG( hex << "\\x" << (int)(unsigned char)str1[i]);
00209    }
00210    DEBUG("\" str2 = \"");
00211    for (int i = 0; i < str2.length(); ++i)
00212    {
00213       DEBUG( hex << "\\x" << (int)(unsigned char)str2[i]);
00214    }
00215    DEBUG( "\"\n");
00216 }
00217 
00218 
00219 void buildTransitions(const String& str1, const String& str2)
00220 {
00221    printStrings(str1, str2);
00222 
00223    // do it transitions for str1/str2
00224    int trans = StateMachine::start;
00225    int pos1 = 0;
00226    int pos2 = 0;
00227    while (pos1 < str1.length() || pos2 < str2.length())
00228    {
00229       if (str1[pos1])
00230          trans = followOrAddTransition(trans, str1[pos1++], 1);
00231 
00232       if (str2[pos2])
00233          trans = followOrAddTransition(trans, str2[pos2++], 2);
00234    }
00235    DEBUG( "setStateAccept(" << trans << ")\n");
00236    stateMachine.setStateAccept(trans);
00237 }
00238 
00239 struct processLine
00240 {
00241    void operator()(const String& s) const
00242    {
00243       if (s.empty() || !isxdigit(s[0]))
00244          return;
00245 
00246       DEBUG("processLine: " << s << endl);
00247       StringArray a = s.tokenize(";"); // split up fields
00248       assert(a.size() >= 3);
00249       UInt32 c1 = a[0].toUInt32(16);
00250       StringArray a2 = a[2].tokenize(" "); // split up chars are separated by spaces
00251       Array<UInt32> c2chars(a2.size());
00252       for (size_t i = 0; i < a2.size(); ++i)
00253       {
00254          c2chars[i] = a2[i].toUInt32(16);
00255       }
00256       String str1 = UTF8Utils::UCS4toUTF8(c1);
00257       String str2;
00258       for (size_t i = 0; i < c2chars.size(); ++i)
00259       {
00260          str2 += UTF8Utils::UCS4toUTF8(c2chars[i]);
00261       }
00262 
00263       caseFoldingEntries.insert(std::make_pair(str1, str2));
00264       caseFoldingEntries.insert(std::make_pair(str2, str1));
00265       //buildTransitions(str1, str2);
00266       //buildTransitions(str2, str1);
00267    }
00268 };
00269 
00270 struct isForInput : public unary_function<StateMachine::transition_t, bool>
00271 {
00272    isForInput(int input) : m_input(input) {}
00273    int m_input;
00274    bool operator()(const StateMachine::transition_t& t)
00275    {
00276       return t.inputSelection == m_input;
00277    }
00278 };
00279 
00280 void outputHeader()
00281 {
00282    cout << "/*******************************************************************************\n";
00283    cout << "* Copyright (C) 2004 Vintela, Inc. All rights reserved.\n";
00284    cout << "* Copyright (C) 2005 Novell, Inc. All rights reserved.\n";
00285    cout << "*\n";
00286    cout << "* Redistribution and use in source and binary forms, with or without\n";
00287    cout << "* modification, are permitted provided that the following conditions are met:\n";
00288    cout << "*\n";
00289    cout << "*  - Redistributions of source code must retain the above copyright notice,\n";
00290    cout << "*    this list of conditions and the following disclaimer.\n";
00291    cout << "*\n";
00292    cout << "*  - Redistributions in binary form must reproduce the above copyright notice,\n";
00293    cout << "*    this list of conditions and the following disclaimer in the documentation\n";
00294    cout << "*    and/or other materials provided with the distribution.\n";
00295    cout << "*\n";
00296    cout << "*  - Neither the name of Vintela, Inc., Novell, Inc., nor the names of its\n";
00297    cout << "*    contributors may be used to endorse or promote products derived from this\n";
00298    cout << "*    software without specific prior written permission.\n";
00299    cout << "*\n";
00300    cout << "* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''\n";
00301    cout << "* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n";
00302    cout << "* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n";
00303    cout << "* ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc., Novell, Inc., OR THE\n";
00304    cout << "* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n";
00305    cout << "* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n";
00306    cout << "* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;\n";
00307    cout << "* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\n";
00308    cout << "* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR\n";
00309    cout << "* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF\n";
00310    cout << "* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n";
00311    cout << "*******************************************************************************/\n";
00312    cout << "\n// Do NOT modify this file.  It was generated by GenCaseFoldingCompare.cpp\n";
00313    cout << "// If this file needs to be modified, change the generator and regenerate it.\n";
00314 }
00315 
00316 void outputTransitions(const StateMachine::state_t& state, int inputSelection, bool outputDefault)
00317 {
00318    for (int j = 0; j < state.transitions.size(); ++j)
00319    {
00320       if (state.transitions[j].toState != -1 && state.transitions[j].inputSelection == inputSelection)
00321       {
00322          cout <<  "\t\tcase 0x" << hex << j << ": goto state"  << state.transitions[j].toState << ";\n";
00323       }
00324    }
00325    if (outputDefault)
00326    {
00327       cout << "\t\tdefault: goto no_match;\n";
00328    }
00329    cout << "\t}\n";
00330 }
00331 
00332 void outputFirstState(const StateMachine::state_t& state)
00333 {
00334    cout << "\tswitch (*(str1++)){\n";
00335    // first state has to handle 0
00336    cout << "\t\tcase 0x0: goto zero;\n";
00337    outputTransitions(state, 1, true);
00338 }
00339 
00340 void outputSwitch(const StateMachine::state_t& state, int inputSelection, bool outputDefault)
00341 {
00342    cout << "\tswitch (*(str" << inputSelection << "++)){\n";
00343    outputTransitions(state, inputSelection, outputDefault);
00344 }
00345 
00346 void outputCode()
00347 {
00348    outputHeader();
00349 
00350    cout << "#include \"BLOCXX_config.h\"\n";
00351    cout << "#include \"UTF8Utils.hpp\"\n";
00352    cout << "\nnamespace BLOCXX_NAMESPACE\n{\n";
00353    cout << "namespace UTF8Utils\n{\n";
00354    cout << "\n/////////////////////////////////////////////////////////////////////////////\n";
00355 
00356    cout << "int compareToIgnoreCase(const char* cstr1, const char* cstr2)\n";
00357    cout << "{\n";
00358    cout << "\tconst unsigned char* str1 = reinterpret_cast<const unsigned char*>(cstr1);\n";
00359    cout << "\tconst unsigned char* str2 = reinterpret_cast<const unsigned char*>(cstr2);\n";
00360    cout << "\tconst unsigned char* str1marker = 0;\n";
00361    cout << "\tconst unsigned char* str2marker = 0;\n";
00362    cout << "\tgoto state0;\n";
00363    cout << "no_match:\n";
00364    cout << "\tif (str1marker) {\n";
00365    cout << "\t\tstr1 = str1marker; str1marker = 0;\n";
00366    cout << "\t\tstr2 = str2marker; str2marker = 0;\n";
00367    cout << "\t\tgoto state0;\n";
00368    cout << "\t}\n";
00369    cout << "\treturn *(str1-1) - *(str2-1);\n";
00370    cout << "zero:\n";
00371    cout << "\treturn 0 - *str2;\n";
00372 
00373    for (int i = 0; i < stateMachine.m_states.size(); ++i)
00374    {
00375 
00376       cout << "state" << i << ":\n";
00377       int c1 = count_if (stateMachine.m_states[i].transitions.begin(), stateMachine.m_states[i].transitions.end(), isForInput(1));
00378       int c2 = count_if (stateMachine.m_states[i].transitions.begin(), stateMachine.m_states[i].transitions.end(), isForInput(2));
00379 
00380       if (i == 0)
00381       {
00382          outputFirstState(stateMachine.m_states[0]);
00383       }
00384       else
00385       {
00386          // we're in an accept state with outgoing transitions, we need to save our position
00387          if ((c1 || c2) && stateMachine.m_states[i].acceptState) 
00388          {
00389             cout << "\tstr1marker = str1;\n";
00390             cout << "\tstr2marker = str2;\n";
00391          }
00392          if (c1)
00393          {
00394             outputSwitch(stateMachine.m_states[i], 1, c2 == 0);
00395          }
00396          // need to rewind str1 in the case of 2 switches and the first one doesn't match
00397          if (c1 && c2)
00398          {
00399             cout << "\t--str1;\n";
00400          }
00401          if (c2)
00402          {
00403             outputSwitch(stateMachine.m_states[i], 2, true); // true means ouput the default
00404          }
00405          if (c1 == 0 && c2 == 0)
00406          {
00407             if (stateMachine.m_states[i].acceptState)
00408             {
00409                cout << "\tgoto state0;\n";
00410             }
00411             else
00412             {
00413                cout << "\tgoto no_match;\n";
00414             }
00415          }
00416       }
00417 
00418 
00419    }
00420    cout << "}\n\n";
00421    cout << "} // end namespace UTF8Utils\n";
00422    cout << "} // end namespace BLOCXX_NAMESPACE\n\n";
00423 }
00424 
00425 bool findDuplicateStates(int& state1, int& state2)
00426 {
00427    for (int i = stateMachine.m_states.size() - 2; i > -1; --i)
00428    {
00429       for (int j = stateMachine.m_states.size() -1; j > i; --j)
00430       {
00431          if (stateMachine.m_states[i] == stateMachine.m_states[j])
00432          {
00433             state1 = i;
00434             state2 = j;
00435             DEBUG("found duplicate states: " << i << " and " << j << endl);
00436             return true;
00437          }
00438       }
00439    }
00440    return false;
00441 }
00442 
00443 void minimizeStateMachine()
00444 {
00445    // this is a horribly inefficient way of doing this, but it works, was 
00446    // simple to code, and it only needs to be run once.
00447    DEBUG("minimizing state machine\n");
00448    int state1 = StateMachine::invalid;
00449    int state2 = StateMachine::invalid;
00450    while (findDuplicateStates(state1, state2))
00451    {
00452       stateMachine.removeDuplicateState(state1, state2);
00453    }
00454 }
00455 
00456 void getEntriesFor(const String& key, set<String>& rval)
00457 {
00458    for (ci_t ci = caseFoldingEntries.lower_bound(key);
00459       ci->first == key;
00460       ++ci)
00461    {
00462       if (rval.find(ci->second) == rval.end())
00463       {
00464          rval.insert(ci->second);
00465          getEntriesFor(ci->second, rval);
00466       }
00467    }
00468 }
00469 
00470 bool haveEntry(const String& key, const String& val)
00471 {
00472    for (ci_t ci = caseFoldingEntries.lower_bound(key);
00473       ci->first == key;
00474       ++ci)
00475    {
00476       if (ci->second == val)
00477          return true;
00478    }
00479    return false;
00480 }
00481 
00482 void calculateTransitiveClosure()
00483 {
00484    DEBUG("calculateTransitiveClosure\n");
00485 start_over:
00486    for (ci_t ci = caseFoldingEntries.begin();
00487       ci != caseFoldingEntries.end();
00488       ++ci)
00489    {
00490       set<String> newEntries;
00491       getEntriesFor(ci->second, newEntries);
00492       bool addedAnEntry = false;
00493       String key = ci->first;  // make a copy since the iterator may be invalidated after an insert
00494       for (set<String>::const_iterator curEntry = newEntries.begin(); curEntry != newEntries.end(); ++curEntry)
00495       {
00496          if (!haveEntry(key, *curEntry))
00497          {
00498             caseFoldingEntries.insert(std::make_pair(key, *curEntry));
00499             addedAnEntry = true;
00500          }
00501       }
00502 
00503       if (addedAnEntry)
00504          goto start_over; // since the iterators may be invalidated.
00505    }
00506 }
00507 
00508 void buildStateMachine()
00509 {
00510    for (ci_t ci = caseFoldingEntries.begin();
00511       ci != caseFoldingEntries.end();
00512       ++ci)
00513    {
00514       buildTransitions(ci->first, ci->second);
00515    }
00516 }
00517 
00518 int main(int argc, char** argv)
00519 {
00520    if (argc != 2)
00521    {
00522       cerr << "must pass filename (to CaseFolding.txt)" << endl;
00523       return 1;
00524    }
00525 
00526    ifstream in(argv[1]);
00527    if (!in)
00528    {
00529       cerr << "could not open " << argv[1] << endl;
00530       return 1;
00531    }
00532 
00533    // add transitions for equal matches
00534    for (int i = 1; i < 256; ++i)
00535    {
00536       String s = String(char(i));
00537       caseFoldingEntries.insert(std::make_pair(s, s));
00538       //buildTransitions(s, s);
00539    }
00540 
00541    // read in a process the input file
00542    OStringStream ss;
00543    ss << in.rdbuf();
00544    String s = ss.toString();
00545    StringArray sa = s.tokenize("\n");
00546    for_each(sa.begin(), sa.end(), processLine());
00547 
00548    calculateTransitiveClosure();
00549    buildStateMachine();
00550 
00551    // disable duplicate states
00552    minimizeStateMachine();
00553 
00554    // now generate the code:
00555    outputCode();
00556 }
00557 

Generated on Mon Sep 12 23:56:34 2005 for blocxx by  doxygen 1.4.4