• Skip to content
  • Skip to link menu
KDE 4.6 API Reference
  • KDE API Reference
  • kdelibs
  • KDE Home
  • Contact Us
 

KHTML

khtml_filter.cpp

Go to the documentation of this file.
00001 /* This file is part of the KDE project
00002 
00003    Copyright (C) 2005 Ivor Hewitt     <ivor@kde.org>
00004    Copyright (C) 2008 Maksim Orlovich <maksim@kde.org>
00005    Copyright (C) 2008 Vyacheslav Tokarev <tsjoker@gmail.com>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Library General Public
00009    License as published by the Free Software Foundation; either
00010    version 2 of the License, or (at your option) any later version.
00011 
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Library General Public License for more details.
00016 
00017    You should have received a copy of the GNU Library General Public License
00018    along with this library; see the file COPYING.LIB.  If not, write to
00019    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00020    Boston, MA 02110-1301, USA.
00021 */
00022 
00023 #include "khtml_filter_p.h"
00024 #include <QDebug>
00025 
00026 // rolling hash parameters
00027 #define HASH_P (1997)
00028 #define HASH_Q (17509)
00029 // HASH_MOD = (HASH_P^7) % HASH_Q
00030 #define HASH_MOD (523)
00031 
00032 namespace khtml {
00033 
00034 // We only want a subset of features of wildcards -- just the 
00035 // star, so we escape the rest before passing to QRegExp. 
00036 // The \ is escaped due to a QRegExp bug.
00037 // ### we really should rather parse it ourselves, in order to 
00038 // handle adblock-special things like | and ^ properly.
00039 static QRegExp fromAdBlockWildcard(const QString& wcStr) {
00040     QRegExp rx;
00041     rx.setPatternSyntax(QRegExp::Wildcard);
00042 
00043     QString out;
00044     for (int p = 0; p < wcStr.length(); ++p) {
00045         QChar c = wcStr[p];
00046         if (c == QLatin1Char('?'))
00047             out += QLatin1String("[?]");
00048         else if (c == QLatin1Char('['))
00049             out += QLatin1String("[[]");
00050         else if (c == QLatin1Char('\\'))
00051             out += QLatin1String("[\\]");
00052         else
00053             out += c;
00054     }
00055     
00056     rx.setPattern(out);
00057     return rx;
00058 }
00059 
00060 void FilterSet::addFilter(const QString& filterStr)
00061 {
00062     QString filter = filterStr;
00063 
00065     QChar firstChar = filter.at(0);
00066     if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('&') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#')))
00067         return;
00068 
00069     // Strip leading @@
00070     int first = 0;
00071     int last  = filter.length() - 1;
00072     if (filter.startsWith(QLatin1String("@@")))
00073         first = 2;
00074 
00075     // Strip options, we ignore them for now.
00076     int dollar = filter.lastIndexOf(QLatin1Char('$'));
00077     if (dollar != -1)
00078         last = dollar - 1;
00079 
00080     // Perhaps nothing left?
00081     if (first > last)
00082         return;
00083 
00084     filter = filter.mid(first, last - first + 1);
00085 
00086     // Is it a regexp filter?
00087     if (filter.length()>2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/')))
00088     {
00089         QString inside = filter.mid(1, filter.length()-2);
00090         QRegExp rx(inside);
00091         reFilters.append(rx);
00092 //         qDebug() << "R:" << inside;
00093     }
00094     else
00095     {
00096         // Nope, a wildcard one.
00097         // Note: For these, we also need to handle |.
00098 
00099         // Strip wildcards at the ends
00100         first = 0;
00101         last  = filter.length() - 1;
00102 
00103         while (first < filter.length() && filter[first] == QLatin1Char('*'))
00104             ++first;
00105 
00106         while (last >= 0 && filter[last] == QLatin1Char('*'))
00107             --last;
00108 
00109         if (first > last)
00110             filter = QLatin1String("*"); // erm... Well, they asked for it.
00111         else
00112             filter = filter.mid(first, last - first + 1);
00113 
00114         // Now, do we still have any wildcard stuff left?
00115         if (filter.contains("*"))
00116         {
00117             // check if we can use RK first (and then check full RE for the rest) for better performance
00118             int aPos = filter.indexOf('*');
00119             if (aPos < 0)
00120                 aPos = filter.length();
00121             if (aPos > 7) {
00122                 QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*'));
00123                 // We pad the final r.e. with * so we can check for an exact match
00124                 stringFiltersMatcher.addWildedString(filter.mid(0, aPos), rx);
00125             } else {
00126                 QRegExp rx = fromAdBlockWildcard(filter);
00127                 reFilters.append(rx);
00128             }
00129         }
00130         else
00131         {
00132             // Fast path
00133             stringFiltersMatcher.addString(filter);
00134         }
00135     }
00136 }
00137 
00138 bool FilterSet::isUrlMatched(const QString& url)
00139 {
00140     if (stringFiltersMatcher.isMatched(url))
00141         return true;
00142 
00143     for (int c = 0; c < reFilters.size(); ++c)
00144     {
00145         if (url.contains(reFilters[c]))
00146             return true;
00147     }
00148 
00149     return false;
00150 }
00151 
00152 QString FilterSet::urlMatchedBy(const QString& url)
00153 {
00154     QString by;
00155 
00156     if (stringFiltersMatcher.isMatched(url, &by))
00157         return by;
00158 
00159     for (int c = 0; c < reFilters.size(); ++c)
00160     {
00161         if (url.contains(reFilters[c]))
00162         {
00163             by = reFilters[c].pattern();
00164             break;
00165         }
00166     }
00167 
00168     return by;
00169 }
00170 
00171 void FilterSet::clear()
00172 {
00173     reFilters.clear();
00174     stringFiltersMatcher.clear();
00175 }
00176 
00177 
00178 void StringsMatcher::addString(const QString& pattern)
00179 {
00180     if (pattern.length() < 8) {
00181         // handle short string differently
00182         shortStringFilters.append(pattern);
00183     } else {
00184         // use modified Rabin-Karp's algorithm with 8-length string hash
00185         // i.e. store hash of first 8 chars in the HashMap for fast look-up
00186         stringFilters.append(pattern);
00187         int ind = stringFilters.size() - 1;
00188         int current = 0;
00189 
00190         // compute hash using rolling hash
00191         // hash for string: x0,x1,x2...xn-1 will be:
00192         // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q
00193         // where p and q some wisely-chosen integers
00194         /*for (int k = 0; k < 8; ++k)*/
00195         int len = pattern.length();
00196         for (int k = len - 8; k < len; ++k)
00197             current = (current * HASH_P + pattern[k].unicode()) % HASH_Q;
00198 
00199         // insert computed hash value into HashMap
00200         WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
00201         if (it == stringFiltersHash.end()) {
00202             QVector<int> list;
00203             list.append(ind);
00204             stringFiltersHash.add(current + 1, list);
00205             fastLookUp.setBit(current);
00206         } else {
00207             it->second.append(ind);
00208         }
00209     }
00210 }
00211 
00212 void StringsMatcher::addWildedString(const QString& prefix, const QRegExp& rx)
00213 {
00214     rePrefixes.append(prefix);
00215     reFilters.append(rx);
00216     int index = -rePrefixes.size();
00217 
00218     int current = 0;
00219     for (int k = 0; k < 8; ++k)
00220         current = (current * HASH_P + prefix[k].unicode()) % HASH_Q;
00221 
00222     // insert computed hash value into HashMap
00223     WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
00224     if (it == stringFiltersHash.end()) {
00225         QVector<int> list;
00226         list.append(index);
00227         stringFiltersHash.add(current + 1, list);
00228         fastLookUp.setBit(current);
00229     } else {
00230         it->second.append(index);
00231     }
00232 }
00233 
00234 bool StringsMatcher::isMatched(const QString& str, QString *by) const
00235 {
00236     // check short strings first
00237     for (int i = 0; i < shortStringFilters.size(); ++i) {
00238         if (str.contains(shortStringFilters[i]))
00239         {
00240             if (by != 0) *by = shortStringFilters[i];
00241             return true;
00242         }
00243     }
00244 
00245     int len = str.length();
00246     int k;
00247 
00248     int current = 0;
00249     int next = 0;
00250     // compute hash for first 8 characters
00251     for (k = 0; k < 8 && k < len; ++k)
00252         current = (current * HASH_P + str[k].unicode()) % HASH_Q;
00253 
00254     WTF::HashMap<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end();
00255     // main Rabin-Karp's algorithm loop
00256     for (k = 7; k < len; ++k, current = next) {
00257         // roll the hash if not at the end
00258         // (calculate hash for the next iteration)
00259         if (k + 1 < len)
00260             next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q;
00261 
00262         if (!fastLookUp.testBit(current))
00263             continue;
00264 
00265         // look-up the hash in the HashMap and check all strings
00266         WTF::HashMap<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1);
00267 
00268         // check possible strings
00269         if (it != hashEnd) {
00270             for (int j = 0; j < it->second.size(); ++j) {
00271                 int index = it->second[j];
00272                 // check if we got simple string or REs prefix
00273                 if (index >= 0) {
00274                     int flen = stringFilters[index].length();
00275                     if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1 , flen))
00276                     {
00277                         if (by != 0) *by = stringFilters[index];
00278                         return true;
00279                     }
00280                 } else {
00281                     index = -index - 1;
00282                     int flen = rePrefixes[index].length();
00283                     if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen))
00284                     {
00285                         int remStart = k - 7 + flen;
00286                         QString remainder = QString::fromRawData(str.unicode() + remStart,
00287                                                                  str.length() - remStart);
00288                         if (reFilters[index].exactMatch(remainder)) {
00289                             if (by != 0) *by = rePrefixes[index]+reFilters[index].pattern();
00290                             return true;
00291                         }
00292                     }
00293                 }
00294             }
00295         }
00296     }
00297 
00298     return false;
00299 }
00300 
00301 void StringsMatcher::clear()
00302 {
00303     stringFilters.clear();
00304     shortStringFilters.clear();
00305     reFilters.clear();
00306     rePrefixes.clear();
00307     stringFiltersHash.clear();
00308     fastLookUp.resize(HASH_Q);
00309     fastLookUp.fill(0, 0, HASH_Q);
00310 }
00311 
00312 }
00313 
00314 // kate: indent-width 4; replace-tabs on; tab-width 4; space-indent on;

KHTML

Skip menu "KHTML"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.7.3
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal