00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00036
00037
00038 #include "String.hpp"
00039 #include "Array.hpp"
00040 #include "StringStream.hpp"
00041 #include "UTF8Utils.hpp"
00042 #include <fstream>
00043 #include <iostream>
00044 #include <map>
00045
00046 using namespace std;
00047 using namespace BLOCXX_NAMESPACE;
00048
// Case-mapping tables keyed by code point. They are filled by processLine
// while the input is parsed (upperMap from record field 12, lowerMap from
// record field 13) and are read back by main() when the tables are printed.
00049 map<UInt32, UInt32> upperMap;
00050 map<UInt32, UInt32> lowerMap;
00051
00052 struct processLine
00053 {
00054 void operator()(const String& s) const
00055 {
00056 if (s.empty() || !isxdigit(s[0]))
00057 {
00058 cout << "skipping line\n" << s << '\n';
00059 return;
00060 }
00061
00062 StringArray a = s.tokenize(";", String::E_DISCARD_DELIMITERS, String::E_RETURN_EMPTY_TOKENS);
00063 assert(a.size() >= 14);
00064 UInt32 c1 = a[0].toUInt32(16);
00065 if (a[13] != "")
00066 {
00067 lowerMap[c1] = a[13].toUInt32(16);
00068 }
00069 if (a[12] != "")
00070 {
00071 upperMap[c1] = a[12].toUInt32(16);
00072 }
00073 }
00074 };
00075
00076 int utf8len(UInt32 ucs4char)
00077 {
00078 if (ucs4char < 0x80u)
00079 {
00080 return 1;
00081 }
00082 else if (ucs4char < 0x800u)
00083 {
00084 return 2;
00085 }
00086 else if (ucs4char < 0x10000u)
00087 {
00088 return 3;
00089 }
00090 else
00091 {
00092 return 4;
00093 }
00094 }
00095
00096
00097 int main(int argc, char** argv)
00098 {
00099 ifstream in("UnicodeData.txt");
00100 if (!in)
00101 {
00102 cerr << "could not open UnicodeData.txt" << endl;
00103 return 1;
00104 }
00105
00106
00107 OStringStream ss;
00108 ss << in.rdbuf();
00109 String s = ss.toString();
00110 StringArray sa = s.tokenize("\n");
00111 for_each(sa.begin(), sa.end(), processLine());
00112 cout <<
00113 "struct CaseMapping\n"
00114 "{\n"
00115 "\tUInt32 codePoint;\n"
00116 "\tUInt32 mapping;\n"
00117 "};\n";
00118 cout <<
00119 "const CaseMapping lowerMappings[] =\n"
00120 "{\n";
00121 for (map<UInt32, UInt32>::const_iterator i = lowerMap.begin(); i != lowerMap.end(); ++i)
00122 {
00123 cout << hex << "\t{0x" << i->first << ", 0x" << i->second << "},";
00124 if (utf8len(i->first) < utf8len(i->second))
00125 {
00126
00127 cout << " // increasing utf8 length";
00128 }
00129 else if (utf8len(i->first) > utf8len(i->second))
00130 {
00131
00132 cout << " // decreasing utf8 length";
00133 }
00134 cout << "\n";
00135 }
00136 cout << "};\n\n";
00137 cout <<
00138 "const CaseMapping upperMappings[] =\n"
00139 "{\n";
00140 for (map<UInt32, UInt32>::const_iterator i = upperMap.begin(); i != upperMap.end(); ++i)
00141 {
00142 cout << hex << "\t{0x" << i->first << ", 0x" << i->second << "},";
00143 if (utf8len(i->first) < utf8len(i->second))
00144 {
00145
00146 cout << " // increasing utf8 length";
00147 }
00148 else if (utf8len(i->first) > utf8len(i->second))
00149 {
00150
00151 cout << " // decreasing utf8 length";
00152 }
00153 cout << "\n";
00154 }
00155 cout << "};\n";
00156 }
00157