KHTML
khtml_filter.cpp
Go to the documentation of this file.
00001 /* This file is part of the KDE project 00002 00003 Copyright (C) 2005 Ivor Hewitt <ivor@kde.org> 00004 Copyright (C) 2008 Maksim Orlovich <maksim@kde.org> 00005 Copyright (C) 2008 Vyacheslav Tokarev <tsjoker@gmail.com> 00006 00007 This library is free software; you can redistribute it and/or 00008 modify it under the terms of the GNU Library General Public 00009 License as published by the Free Software Foundation; either 00010 version 2 of the License, or (at your option) any later version. 00011 00012 This library is distributed in the hope that it will be useful, 00013 but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 Library General Public License for more details. 00016 00017 You should have received a copy of the GNU Library General Public License 00018 along with this library; see the file COPYING.LIB. If not, write to 00019 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00020 Boston, MA 02110-1301, USA. 00021 */ 00022 00023 #include "khtml_filter_p.h" 00024 #include <QDebug> 00025 00026 // rolling hash parameters 00027 #define HASH_P (1997) 00028 #define HASH_Q (17509) 00029 // HASH_MOD = (HASH_P^7) % HASH_Q 00030 #define HASH_MOD (523) 00031 00032 namespace khtml { 00033 00034 // We only want a subset of features of wildcards -- just the 00035 // star, so we escape the rest before passing to QRegExp. 00036 // The \ is escaped due to a QRegExp bug. 00037 // ### we really should rather parse it ourselves, in order to 00038 // handle adblock-special things like | and ^ properly. 00039 static QRegExp fromAdBlockWildcard(const QString& wcStr) { 00040 QRegExp rx; 00041 rx.setPatternSyntax(QRegExp::Wildcard); 00042 00043 QString out; 00044 for (int p = 0; p < wcStr.length(); ++p) { 00045 QChar c = wcStr[p]; 00046 if (c == QLatin1Char('?')) 00047 out += QLatin1String("[?]"); 00048 else if (c == QLatin1Char('[')) 00049 out += QLatin1String("[[]"); 00050 else if (c == QLatin1Char('\\')) 00051 out += QLatin1String("[\\]"); 00052 else 00053 out += c; 00054 } 00055 00056 rx.setPattern(out); 00057 return rx; 00058 } 00059 00060 void FilterSet::addFilter(const QString& filterStr) 00061 { 00062 QString filter = filterStr; 00063 00065 QChar firstChar = filter.at(0); 00066 if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('&') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#'))) 00067 return; 00068 00069 // Strip leading @@ 00070 int first = 0; 00071 int last = filter.length() - 1; 00072 if (filter.startsWith(QLatin1String("@@"))) 00073 first = 2; 00074 00075 // Strip options, we ignore them for now. 00076 int dollar = filter.lastIndexOf(QLatin1Char('$')); 00077 if (dollar != -1) 00078 last = dollar - 1; 00079 00080 // Perhaps nothing left? 00081 if (first > last) 00082 return; 00083 00084 filter = filter.mid(first, last - first + 1); 00085 00086 // Is it a regexp filter? 00087 if (filter.length()>2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/'))) 00088 { 00089 QString inside = filter.mid(1, filter.length()-2); 00090 QRegExp rx(inside); 00091 reFilters.append(rx); 00092 // qDebug() << "R:" << inside; 00093 } 00094 else 00095 { 00096 // Nope, a wildcard one. 00097 // Note: For these, we also need to handle |. 00098 00099 // Strip wildcards at the ends 00100 first = 0; 00101 last = filter.length() - 1; 00102 00103 while (first < filter.length() && filter[first] == QLatin1Char('*')) 00104 ++first; 00105 00106 while (last >= 0 && filter[last] == QLatin1Char('*')) 00107 --last; 00108 00109 if (first > last) 00110 filter = QLatin1String("*"); // erm... Well, they asked for it. 00111 else 00112 filter = filter.mid(first, last - first + 1); 00113 00114 // Now, do we still have any wildcard stuff left? 00115 if (filter.contains("*")) 00116 { 00117 // check if we can use RK first (and then check full RE for the rest) for better performance 00118 int aPos = filter.indexOf('*'); 00119 if (aPos < 0) 00120 aPos = filter.length(); 00121 if (aPos > 7) { 00122 QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*')); 00123 // We pad the final r.e. with * so we can check for an exact match 00124 stringFiltersMatcher.addWildedString(filter.mid(0, aPos), rx); 00125 } else { 00126 QRegExp rx = fromAdBlockWildcard(filter); 00127 reFilters.append(rx); 00128 } 00129 } 00130 else 00131 { 00132 // Fast path 00133 stringFiltersMatcher.addString(filter); 00134 } 00135 } 00136 } 00137 00138 bool FilterSet::isUrlMatched(const QString& url) 00139 { 00140 if (stringFiltersMatcher.isMatched(url)) 00141 return true; 00142 00143 for (int c = 0; c < reFilters.size(); ++c) 00144 { 00145 if (url.contains(reFilters[c])) 00146 return true; 00147 } 00148 00149 return false; 00150 } 00151 00152 QString FilterSet::urlMatchedBy(const QString& url) 00153 { 00154 QString by; 00155 00156 if (stringFiltersMatcher.isMatched(url, &by)) 00157 return by; 00158 00159 for (int c = 0; c < reFilters.size(); ++c) 00160 { 00161 if (url.contains(reFilters[c])) 00162 { 00163 by = reFilters[c].pattern(); 00164 break; 00165 } 00166 } 00167 00168 return by; 00169 } 00170 00171 void FilterSet::clear() 00172 { 00173 reFilters.clear(); 00174 stringFiltersMatcher.clear(); 00175 } 00176 00177 00178 void StringsMatcher::addString(const QString& pattern) 00179 { 00180 if (pattern.length() < 8) { 00181 // handle short string differently 00182 shortStringFilters.append(pattern); 00183 } else { 00184 // use modified Rabin-Karp's algorithm with 8-length string hash 00185 // i.e. store hash of first 8 chars in the HashMap for fast look-up 00186 stringFilters.append(pattern); 00187 int ind = stringFilters.size() - 1; 00188 int current = 0; 00189 00190 // compute hash using rolling hash 00191 // hash for string: x0,x1,x2...xn-1 will be: 00192 // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q 00193 // where p and q some wisely-chosen integers 00194 /*for (int k = 0; k < 8; ++k)*/ 00195 int len = pattern.length(); 00196 for (int k = len - 8; k < len; ++k) 00197 current = (current * HASH_P + pattern[k].unicode()) % HASH_Q; 00198 00199 // insert computed hash value into HashMap 00200 WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1); 00201 if (it == stringFiltersHash.end()) { 00202 QVector<int> list; 00203 list.append(ind); 00204 stringFiltersHash.add(current + 1, list); 00205 fastLookUp.setBit(current); 00206 } else { 00207 it->second.append(ind); 00208 } 00209 } 00210 } 00211 00212 void StringsMatcher::addWildedString(const QString& prefix, const QRegExp& rx) 00213 { 00214 rePrefixes.append(prefix); 00215 reFilters.append(rx); 00216 int index = -rePrefixes.size(); 00217 00218 int current = 0; 00219 for (int k = 0; k < 8; ++k) 00220 current = (current * HASH_P + prefix[k].unicode()) % HASH_Q; 00221 00222 // insert computed hash value into HashMap 00223 WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1); 00224 if (it == stringFiltersHash.end()) { 00225 QVector<int> list; 00226 list.append(index); 00227 stringFiltersHash.add(current + 1, list); 00228 fastLookUp.setBit(current); 00229 } else { 00230 it->second.append(index); 00231 } 00232 } 00233 00234 bool StringsMatcher::isMatched(const QString& str, QString *by) const 00235 { 00236 // check short strings first 00237 for (int i = 0; i < shortStringFilters.size(); ++i) { 00238 if (str.contains(shortStringFilters[i])) 00239 { 00240 if (by != 0) *by = shortStringFilters[i]; 00241 return true; 00242 } 00243 } 00244 00245 int len = str.length(); 00246 int k; 00247 00248 int current = 0; 00249 int next = 0; 00250 // compute hash for first 8 characters 00251 for (k = 0; k < 8 && k < len; ++k) 00252 current = (current * HASH_P + str[k].unicode()) % HASH_Q; 00253 00254 WTF::HashMap<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end(); 00255 // main Rabin-Karp's algorithm loop 00256 for (k = 7; k < len; ++k, current = next) { 00257 // roll the hash if not at the end 00258 // (calculate hash for the next iteration) 00259 if (k + 1 < len) 00260 next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q; 00261 00262 if (!fastLookUp.testBit(current)) 00263 continue; 00264 00265 // look-up the hash in the HashMap and check all strings 00266 WTF::HashMap<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1); 00267 00268 // check possible strings 00269 if (it != hashEnd) { 00270 for (int j = 0; j < it->second.size(); ++j) { 00271 int index = it->second[j]; 00272 // check if we got simple string or REs prefix 00273 if (index >= 0) { 00274 int flen = stringFilters[index].length(); 00275 if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1 , flen)) 00276 { 00277 if (by != 0) *by = stringFilters[index]; 00278 return true; 00279 } 00280 } else { 00281 index = -index - 1; 00282 int flen = rePrefixes[index].length(); 00283 if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen)) 00284 { 00285 int remStart = k - 7 + flen; 00286 QString remainder = QString::fromRawData(str.unicode() + remStart, 00287 str.length() - remStart); 00288 if (reFilters[index].exactMatch(remainder)) { 00289 if (by != 0) *by = rePrefixes[index]+reFilters[index].pattern(); 00290 return true; 00291 } 00292 } 00293 } 00294 } 00295 } 00296 } 00297 00298 return false; 00299 } 00300 00301 void StringsMatcher::clear() 00302 { 00303 stringFilters.clear(); 00304 shortStringFilters.clear(); 00305 reFilters.clear(); 00306 rePrefixes.clear(); 00307 stringFiltersHash.clear(); 00308 fastLookUp.resize(HASH_Q); 00309 fastLookUp.fill(0, 0, HASH_Q); 00310 } 00311 00312 } 00313 00314 // kate: indent-width 4; replace-tabs on; tab-width 4; space-indent on;
KDE 4.6 API Reference