Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members

GenCaseMappings.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002 * Copyright (C) 2004 Vintela, Inc. All rights reserved.
00003 * Copyright (C) 2005 Novell, Inc. All rights reserved.
00004 *
00005 * Redistribution and use in source and binary forms, with or without
00006 * modification, are permitted provided that the following conditions are met:
00007 *
00008 *  - Redistributions of source code must retain the above copyright notice,
00009 *    this list of conditions and the following disclaimer.
00010 *
00011 *  - Redistributions in binary form must reproduce the above copyright notice,
00012 *    this list of conditions and the following disclaimer in the documentation
00013 *    and/or other materials provided with the distribution.
00014 *
00015 *  - Neither the name of Vintela, Inc., Novell, Inc., nor the names of its
00016 *    contributors may be used to endorse or promote products derived from this
00017 *    software without specific prior written permission.
00018 *
00019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
00020 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00021 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00022 * ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc., Novell, Inc., OR THE
00023 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00024 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00025 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
00026 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00027 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
00028 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
00029 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030 *******************************************************************************/
00031 
00036 // The source of the Unicode data is: http://www.unicode.org/Public/UNIDATA/
00037 
#include "String.hpp"
#include "Array.hpp"
#include "StringStream.hpp"
#include "UTF8Utils.hpp"
#include <algorithm>
#include <cassert>
#include <cctype>
#include <fstream>
#include <iostream>
#include <map>
00045 
00046 using namespace std;
00047 using namespace BLOCXX_NAMESPACE;
00048 
00049 map<UInt32, UInt32> upperMap;
00050 map<UInt32, UInt32> lowerMap;
00051 
00052 struct processLine
00053 {
00054    void operator()(const String& s) const
00055    {
00056       if (s.empty() || !isxdigit(s[0]))
00057       {
00058          cout << "skipping line\n" << s << '\n';
00059          return;
00060       }
00061 
00062       StringArray a = s.tokenize(";", String::E_DISCARD_DELIMITERS, String::E_RETURN_EMPTY_TOKENS); // split up fields
00063       assert(a.size() >= 14);
00064       UInt32 c1 = a[0].toUInt32(16);
00065       if (a[13] != "")
00066       {
00067          lowerMap[c1] = a[13].toUInt32(16);
00068       }
00069       if (a[12] != "")
00070       {
00071          upperMap[c1] = a[12].toUInt32(16);
00072       }
00073    }
00074 };
00075 
/**
 * Returns the number of bytes (1 to 4) used for ucs4char, based on the
 * thresholds 0x80, 0x800 and 0x10000.
 *
 * Every value at or above 0x10000 yields 4; no upper-range validation is done.
 */
int utf8len(UInt32 ucs4char)
{
   // Start at one byte and add one for each threshold the value reaches.
   int byteCount = 1;
   if (ucs4char >= 0x80u)
   {
      ++byteCount;
   }
   if (ucs4char >= 0x800u)
   {
      ++byteCount;
   }
   if (ucs4char >= 0x10000u)
   {
      ++byteCount;
   }
   return byteCount;
}
00095 
00096 
00097 int main(int argc, char** argv)
00098 {
00099    ifstream in("UnicodeData.txt");
00100    if (!in)
00101    {
00102       cerr << "could not open UnicodeData.txt" << endl;
00103       return 1;
00104    }
00105 
00106    // read in a process the input file
00107    OStringStream ss;
00108    ss << in.rdbuf();
00109    String s = ss.toString();
00110    StringArray sa = s.tokenize("\n");
00111    for_each(sa.begin(), sa.end(), processLine());
00112    cout << 
00113       "struct CaseMapping\n"
00114       "{\n"
00115       "\tUInt32 codePoint;\n"
00116       "\tUInt32 mapping;\n"
00117       "};\n";
00118    cout <<
00119       "const CaseMapping lowerMappings[] =\n"
00120       "{\n";
00121    for (map<UInt32, UInt32>::const_iterator i = lowerMap.begin(); i != lowerMap.end(); ++i)
00122    {
00123       cout << hex << "\t{0x" << i->first << ", 0x" << i->second << "},";
00124       if (utf8len(i->first) < utf8len(i->second))
00125       {
00126          // do this to see if there are any utf8 sequences that would grow when lower-casing them.
00127          cout << " // increasing utf8 length";
00128       }
00129       else if (utf8len(i->first) > utf8len(i->second))
00130       {
00131          // do this to see if there are any utf8 sequences that would grow when lower-casing them.
00132          cout << " // decreasing utf8 length";
00133       }
00134       cout << "\n";
00135    }
00136    cout << "};\n\n";
00137    cout << 
00138       "const CaseMapping upperMappings[] =\n"
00139       "{\n";
00140    for (map<UInt32, UInt32>::const_iterator i = upperMap.begin(); i != upperMap.end(); ++i)
00141    {
00142       cout << hex << "\t{0x" << i->first << ", 0x" << i->second << "},";
00143       if (utf8len(i->first) < utf8len(i->second))
00144       {
00145          // do this to see if there are any utf8 sequences that would grow when lower-casing them.
00146          cout << " // increasing utf8 length";
00147       }
00148       else if (utf8len(i->first) > utf8len(i->second))
00149       {
00150          // do this to see if there are any utf8 sequences that would grow when lower-casing them.
00151          cout << " // decreasing utf8 length";
00152       }
00153       cout << "\n";
00154    }
00155    cout << "};\n";
00156 }
00157 

Generated on Mon Sep 12 23:56:34 2005 for blocxx by  doxygen 1.4.4