00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00035
00036
00037 #include "String.hpp"
00038 #include "Array.hpp"
00039 #include "StringStream.hpp"
00040 #include "UTF8Utils.hpp"
00041 #include <fstream>
00042 #include <cctype>
00043 #include <algorithm>
00044 #include <vector>
00045 #include <iostream>
00046 #include <cassert>
00047 #include <map>
00048 #include <set>
00049
00050 using namespace std;
00051 using namespace BLOCXX_NAMESPACE;
00052
00053 #if 1
00054 #define DEBUG(x) cout << x
00055 #else
00056 #define DEBUG(x)
00057 #endif
00058
00059 class StateMachine
00060 {
00061 public:
00062 static const int start = 0;
00063 static const int invalid = -1;
00064
00065 StateMachine()
00066 {
00067
00068 m_states.resize(1);
00069 }
00070 int getTransition(int state, UInt8 input) const
00071 {
00072 assert(m_states.size() > state);
00073 return m_states[state].transitions[input].toState;
00074 }
00075
00076 int getStateStr(int state, UInt8 input) const
00077 {
00078 assert(m_states.size() > state);
00079 return m_states[state].transitions[input].inputSelection;
00080 }
00081
00082 int addState()
00083 {
00084 m_states.resize(m_states.size() + 1);
00085 int state = m_states.size() - 1;
00086 m_states[state].acceptState = false;
00087 return state;
00088 }
00089
00090 void setStateAccept(int state)
00091 {
00092 assert(m_states.size() > state);
00093 m_states[state].acceptState = true;
00094 }
00095
00096
00097
00098 void addTransition(int transitionsState, UInt8 input, int nextState, int inputSelection)
00099 {
00100 assert(m_states.size() > transitionsState);
00101 assert(m_states.size() > nextState);
00102 assert(inputSelection == 1 || inputSelection == 2);
00103 assert(m_states[transitionsState].transitions[input].toState == invalid);
00104 assert(m_states[transitionsState].transitions[input].inputSelection == -1);
00105 m_states[transitionsState].transitions[input].toState = nextState;
00106 m_states[transitionsState].transitions[input].inputSelection = inputSelection;
00107 }
00108
00109 void debug() const
00110 {
00111 DEBUG("# states = " << m_states.size() << '\n');
00112 }
00113
00114 void removeDuplicateState(int state1, int state2)
00115 {
00116 assert(state1 < state2);
00117 m_states.erase(m_states.begin() + state2);
00118 for (int i = 0; i < m_states.size(); ++i)
00119 {
00120 for (int j = 0; j < m_states[i].transitions.size(); ++j)
00121 {
00122 if (m_states[i].transitions[j].toState == state2)
00123 {
00124 m_states[i].transitions[j].toState = state1;
00125 }
00126 else if (m_states[i].transitions[j].toState > state2)
00127 {
00128
00129 --m_states[i].transitions[j].toState;
00130 }
00131 }
00132 }
00133 }
00134
00135
00136 struct transition_t
00137 {
00138 transition_t()
00139 : inputSelection(-1)
00140 , toState(invalid)
00141 {
00142 }
00143 int inputSelection;
00144 int toState;
00145
00146 friend bool operator==(const transition_t& x, const transition_t& y)
00147 {
00148 return (x.inputSelection == y.inputSelection) && (x.toState == y.toState);
00149 }
00150 };
00151
00152 struct state_t
00153 {
00154 state_t()
00155 : transitions()
00156 , acceptState(false)
00157 {
00158 transitions.resize(256);
00159 }
00160 vector<transition_t> transitions;
00161 bool acceptState;
00162
00163 friend bool operator==(const state_t& x, const state_t& y)
00164 {
00165 return (x.transitions == y.transitions) && (x.acceptState == y.acceptState);
00166 }
00167 };
00168 vector<state_t> m_states;
00169 };
00170
00171 StateMachine stateMachine;
00172 typedef std::multimap<String, String> mmap_t;
00173 typedef mmap_t::const_iterator ci_t;
00174 std::multimap<String, String> caseFoldingEntries;
00175
00176 int followOrAddTransition(int curTransition, UInt8 input, int aux)
00177 {
00178 int nextTransition = stateMachine.getTransition(curTransition, input);
00179 if (nextTransition == StateMachine::invalid)
00180 {
00181
00182 int newState = stateMachine.addState();
00183 DEBUG("added new state " << newState);
00184
00185 stateMachine.addTransition(curTransition, input, newState, aux);
00186 DEBUG(" and transition from " << curTransition << " to " << newState << " on input " << aux << " = \\x" << int(input) << endl);
00187 nextTransition = newState;
00188 }
00189 else
00190 {
00191 if (stateMachine.getStateStr(curTransition, input) == -1)
00192 {
00193 DEBUG("?? stateMachine.getStateStr(curTransition, input) == -1 ??" << endl);
00194
00195 }
00196 DEBUG("transition already exists. curTransition = " << curTransition << " aux = " << aux << " getStateStr = " << stateMachine.getStateStr(curTransition, input) << endl);
00197
00198 assert(stateMachine.getStateStr(curTransition, input) == aux);
00199 }
00200 return nextTransition;
00201 }
00202
00203 void printStrings(const String& str1, const String& str2)
00204 {
00205 DEBUG("buildTransitions: str1 = \"");
00206 for (int i = 0; i < str1.length(); ++i)
00207 {
00208 DEBUG( hex << "\\x" << (int)(unsigned char)str1[i]);
00209 }
00210 DEBUG("\" str2 = \"");
00211 for (int i = 0; i < str2.length(); ++i)
00212 {
00213 DEBUG( hex << "\\x" << (int)(unsigned char)str2[i]);
00214 }
00215 DEBUG( "\"\n");
00216 }
00217
00218
00219 void buildTransitions(const String& str1, const String& str2)
00220 {
00221 printStrings(str1, str2);
00222
00223
00224 int trans = StateMachine::start;
00225 int pos1 = 0;
00226 int pos2 = 0;
00227 while (pos1 < str1.length() || pos2 < str2.length())
00228 {
00229 if (str1[pos1])
00230 trans = followOrAddTransition(trans, str1[pos1++], 1);
00231
00232 if (str2[pos2])
00233 trans = followOrAddTransition(trans, str2[pos2++], 2);
00234 }
00235 DEBUG( "setStateAccept(" << trans << ")\n");
00236 stateMachine.setStateAccept(trans);
00237 }
00238
00239 struct processLine
00240 {
00241 void operator()(const String& s) const
00242 {
00243 if (s.empty() || !isxdigit(s[0]))
00244 return;
00245
00246 DEBUG("processLine: " << s << endl);
00247 StringArray a = s.tokenize(";");
00248 assert(a.size() >= 3);
00249 UInt32 c1 = a[0].toUInt32(16);
00250 StringArray a2 = a[2].tokenize(" ");
00251 Array<UInt32> c2chars(a2.size());
00252 for (size_t i = 0; i < a2.size(); ++i)
00253 {
00254 c2chars[i] = a2[i].toUInt32(16);
00255 }
00256 String str1 = UTF8Utils::UCS4toUTF8(c1);
00257 String str2;
00258 for (size_t i = 0; i < c2chars.size(); ++i)
00259 {
00260 str2 += UTF8Utils::UCS4toUTF8(c2chars[i]);
00261 }
00262
00263 caseFoldingEntries.insert(std::make_pair(str1, str2));
00264 caseFoldingEntries.insert(std::make_pair(str2, str1));
00265
00266
00267 }
00268 };
00269
00270 struct isForInput : public unary_function<StateMachine::transition_t, bool>
00271 {
00272 isForInput(int input) : m_input(input) {}
00273 int m_input;
00274 bool operator()(const StateMachine::transition_t& t)
00275 {
00276 return t.inputSelection == m_input;
00277 }
00278 };
00279
00280 void outputHeader()
00281 {
00282 cout << "/*******************************************************************************\n";
00283 cout << "* Copyright (C) 2004 Vintela, Inc. All rights reserved.\n";
00284 cout << "* Copyright (C) 2005 Novell, Inc. All rights reserved.\n";
00285 cout << "*\n";
00286 cout << "* Redistribution and use in source and binary forms, with or without\n";
00287 cout << "* modification, are permitted provided that the following conditions are met:\n";
00288 cout << "*\n";
00289 cout << "* - Redistributions of source code must retain the above copyright notice,\n";
00290 cout << "* this list of conditions and the following disclaimer.\n";
00291 cout << "*\n";
00292 cout << "* - Redistributions in binary form must reproduce the above copyright notice,\n";
00293 cout << "* this list of conditions and the following disclaimer in the documentation\n";
00294 cout << "* and/or other materials provided with the distribution.\n";
00295 cout << "*\n";
00296 cout << "* - Neither the name of Vintela, Inc., Novell, Inc., nor the names of its\n";
00297 cout << "* contributors may be used to endorse or promote products derived from this\n";
00298 cout << "* software without specific prior written permission.\n";
00299 cout << "*\n";
00300 cout << "* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''\n";
00301 cout << "* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n";
00302 cout << "* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n";
00303 cout << "* ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc., Novell, Inc., OR THE\n";
00304 cout << "* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n";
00305 cout << "* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n";
00306 cout << "* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;\n";
00307 cout << "* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\n";
00308 cout << "* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR\n";
00309 cout << "* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF\n";
00310 cout << "* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n";
00311 cout << "*******************************************************************************/\n";
00312 cout << "\n// Do NOT modify this file. It was generated by GenCaseFoldingCompare.cpp\n";
00313 cout << "// If this file needs to be modified, change the generator and regenerate it.\n";
00314 }
00315
00316 void outputTransitions(const StateMachine::state_t& state, int inputSelection, bool outputDefault)
00317 {
00318 for (int j = 0; j < state.transitions.size(); ++j)
00319 {
00320 if (state.transitions[j].toState != -1 && state.transitions[j].inputSelection == inputSelection)
00321 {
00322 cout << "\t\tcase 0x" << hex << j << ": goto state" << state.transitions[j].toState << ";\n";
00323 }
00324 }
00325 if (outputDefault)
00326 {
00327 cout << "\t\tdefault: goto no_match;\n";
00328 }
00329 cout << "\t}\n";
00330 }
00331
00332 void outputFirstState(const StateMachine::state_t& state)
00333 {
00334 cout << "\tswitch (*(str1++)){\n";
00335
00336 cout << "\t\tcase 0x0: goto zero;\n";
00337 outputTransitions(state, 1, true);
00338 }
00339
00340 void outputSwitch(const StateMachine::state_t& state, int inputSelection, bool outputDefault)
00341 {
00342 cout << "\tswitch (*(str" << inputSelection << "++)){\n";
00343 outputTransitions(state, inputSelection, outputDefault);
00344 }
00345
00346 void outputCode()
00347 {
00348 outputHeader();
00349
00350 cout << "#include \"BLOCXX_config.h\"\n";
00351 cout << "#include \"UTF8Utils.hpp\"\n";
00352 cout << "\nnamespace BLOCXX_NAMESPACE\n{\n";
00353 cout << "namespace UTF8Utils\n{\n";
00354 cout << "\n/////////////////////////////////////////////////////////////////////////////\n";
00355
00356 cout << "int compareToIgnoreCase(const char* cstr1, const char* cstr2)\n";
00357 cout << "{\n";
00358 cout << "\tconst unsigned char* str1 = reinterpret_cast<const unsigned char*>(cstr1);\n";
00359 cout << "\tconst unsigned char* str2 = reinterpret_cast<const unsigned char*>(cstr2);\n";
00360 cout << "\tconst unsigned char* str1marker = 0;\n";
00361 cout << "\tconst unsigned char* str2marker = 0;\n";
00362 cout << "\tgoto state0;\n";
00363 cout << "no_match:\n";
00364 cout << "\tif (str1marker) {\n";
00365 cout << "\t\tstr1 = str1marker; str1marker = 0;\n";
00366 cout << "\t\tstr2 = str2marker; str2marker = 0;\n";
00367 cout << "\t\tgoto state0;\n";
00368 cout << "\t}\n";
00369 cout << "\treturn *(str1-1) - *(str2-1);\n";
00370 cout << "zero:\n";
00371 cout << "\treturn 0 - *str2;\n";
00372
00373 for (int i = 0; i < stateMachine.m_states.size(); ++i)
00374 {
00375
00376 cout << "state" << i << ":\n";
00377 int c1 = count_if (stateMachine.m_states[i].transitions.begin(), stateMachine.m_states[i].transitions.end(), isForInput(1));
00378 int c2 = count_if (stateMachine.m_states[i].transitions.begin(), stateMachine.m_states[i].transitions.end(), isForInput(2));
00379
00380 if (i == 0)
00381 {
00382 outputFirstState(stateMachine.m_states[0]);
00383 }
00384 else
00385 {
00386
00387 if ((c1 || c2) && stateMachine.m_states[i].acceptState)
00388 {
00389 cout << "\tstr1marker = str1;\n";
00390 cout << "\tstr2marker = str2;\n";
00391 }
00392 if (c1)
00393 {
00394 outputSwitch(stateMachine.m_states[i], 1, c2 == 0);
00395 }
00396
00397 if (c1 && c2)
00398 {
00399 cout << "\t--str1;\n";
00400 }
00401 if (c2)
00402 {
00403 outputSwitch(stateMachine.m_states[i], 2, true);
00404 }
00405 if (c1 == 0 && c2 == 0)
00406 {
00407 if (stateMachine.m_states[i].acceptState)
00408 {
00409 cout << "\tgoto state0;\n";
00410 }
00411 else
00412 {
00413 cout << "\tgoto no_match;\n";
00414 }
00415 }
00416 }
00417
00418
00419 }
00420 cout << "}\n\n";
00421 cout << "} // end namespace UTF8Utils\n";
00422 cout << "} // end namespace BLOCXX_NAMESPACE\n\n";
00423 }
00424
00425 bool findDuplicateStates(int& state1, int& state2)
00426 {
00427 for (int i = stateMachine.m_states.size() - 2; i > -1; --i)
00428 {
00429 for (int j = stateMachine.m_states.size() -1; j > i; --j)
00430 {
00431 if (stateMachine.m_states[i] == stateMachine.m_states[j])
00432 {
00433 state1 = i;
00434 state2 = j;
00435 DEBUG("found duplicate states: " << i << " and " << j << endl);
00436 return true;
00437 }
00438 }
00439 }
00440 return false;
00441 }
00442
00443 void minimizeStateMachine()
00444 {
00445
00446
00447 DEBUG("minimizing state machine\n");
00448 int state1 = StateMachine::invalid;
00449 int state2 = StateMachine::invalid;
00450 while (findDuplicateStates(state1, state2))
00451 {
00452 stateMachine.removeDuplicateState(state1, state2);
00453 }
00454 }
00455
00456 void getEntriesFor(const String& key, set<String>& rval)
00457 {
00458 for (ci_t ci = caseFoldingEntries.lower_bound(key);
00459 ci->first == key;
00460 ++ci)
00461 {
00462 if (rval.find(ci->second) == rval.end())
00463 {
00464 rval.insert(ci->second);
00465 getEntriesFor(ci->second, rval);
00466 }
00467 }
00468 }
00469
00470 bool haveEntry(const String& key, const String& val)
00471 {
00472 for (ci_t ci = caseFoldingEntries.lower_bound(key);
00473 ci->first == key;
00474 ++ci)
00475 {
00476 if (ci->second == val)
00477 return true;
00478 }
00479 return false;
00480 }
00481
00482 void calculateTransitiveClosure()
00483 {
00484 DEBUG("calculateTransitiveClosure\n");
00485 start_over:
00486 for (ci_t ci = caseFoldingEntries.begin();
00487 ci != caseFoldingEntries.end();
00488 ++ci)
00489 {
00490 set<String> newEntries;
00491 getEntriesFor(ci->second, newEntries);
00492 bool addedAnEntry = false;
00493 String key = ci->first;
00494 for (set<String>::const_iterator curEntry = newEntries.begin(); curEntry != newEntries.end(); ++curEntry)
00495 {
00496 if (!haveEntry(key, *curEntry))
00497 {
00498 caseFoldingEntries.insert(std::make_pair(key, *curEntry));
00499 addedAnEntry = true;
00500 }
00501 }
00502
00503 if (addedAnEntry)
00504 goto start_over;
00505 }
00506 }
00507
00508 void buildStateMachine()
00509 {
00510 for (ci_t ci = caseFoldingEntries.begin();
00511 ci != caseFoldingEntries.end();
00512 ++ci)
00513 {
00514 buildTransitions(ci->first, ci->second);
00515 }
00516 }
00517
00518 int main(int argc, char** argv)
00519 {
00520 if (argc != 2)
00521 {
00522 cerr << "must pass filename (to CaseFolding.txt)" << endl;
00523 return 1;
00524 }
00525
00526 ifstream in(argv[1]);
00527 if (!in)
00528 {
00529 cerr << "could not open " << argv[1] << endl;
00530 return 1;
00531 }
00532
00533
00534 for (int i = 1; i < 256; ++i)
00535 {
00536 String s = String(char(i));
00537 caseFoldingEntries.insert(std::make_pair(s, s));
00538
00539 }
00540
00541
00542 OStringStream ss;
00543 ss << in.rdbuf();
00544 String s = ss.toString();
00545 StringArray sa = s.tokenize("\n");
00546 for_each(sa.begin(), sa.end(), processLine());
00547
00548 calculateTransitiveClosure();
00549 buildStateMachine();
00550
00551
00552 minimizeStateMachine();
00553
00554
00555 outputCode();
00556 }
00557