-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Parsing and extracting information from (possibly malformed) HTML/XML documents
--   
--   TagSoup is a library for parsing HTML/XML. It supports the HTML 5
--   specification, and can be used to parse either well-formed XML, or
--   unstructured and malformed HTML from the web. The library also
--   provides useful functions to extract information from an HTML
--   document, making it ideal for screen-scraping.
--   
--   Users should start from the <a>Text.HTML.TagSoup</a> module.
@package tagsoup
@version 0.12.8


-- | This module converts between HTML/XML entities (i.e.
--   <tt>&amp;amp;</tt>) and the characters they represent.
module Text.HTML.TagSoup.Entity

-- | Lookup an entity, using <a>lookupNumericEntity</a> if it starts with
--   <tt>#</tt> and <a>lookupNamedEntity</a> otherwise
lookupEntity :: String -> Maybe Char

-- | Lookup a named entity, using <a>htmlEntities</a>
--   
--   <pre>
--   lookupNamedEntity "amp" == Just '&amp;'
--   lookupNamedEntity "haskell" == Nothing
--   </pre>
lookupNamedEntity :: String -> Maybe Char

-- | Lookup a numeric entity, the leading <tt>'#'</tt> must have already
--   been removed.
--   
--   <pre>
--   lookupNumericEntity "65" == Just 'A'
--   lookupNumericEntity "x41" == Just 'A'
--   lookupNumericEntity "x4E" == Just 'N'
--   lookupNumericEntity "x4e" == Just 'N'
--   lookupNumericEntity "Haskell" == Nothing
--   lookupNumericEntity "" == Nothing
--   lookupNumericEntity "89439085908539082" == Nothing
--   </pre>
lookupNumericEntity :: String -> Maybe Char

-- | Escape a character before writing it out to XML.
--   
--   <pre>
--   escapeXMLChar 'a' == Nothing
--   escapeXMLChar '&amp;' == Just "amp"
--   </pre>
escapeXMLChar :: Char -> Maybe String

-- | A table mapping XML entity names to code points. Does <i>not</i>
--   include <tt>apos</tt> as Internet Explorer does not know about it.
xmlEntities :: [(String, Int)]

-- | A table mapping HTML entity names to code points
htmlEntities :: [(String, Int)]


-- | <i>WARNING</i>: This module is <i>not</i> intended for use outside the
--   TagSoup library.
--   
--   This module provides an abstraction for Strings as used inside
--   TagSoup. It allows TagSoup to work with String (list of Char),
--   ByteString.Char8, ByteString.Lazy.Char8, Data.Text and Data.Text.Lazy.
module Text.StringLike

-- | A class to generalise TagSoup parsing over many types of string-like
--   types. Examples are given for the String type.
class (Typeable a, Eq a) => StringLike a
empty :: StringLike a => a
cons :: StringLike a => Char -> a -> a
uncons :: StringLike a => a -> Maybe (Char, a)
toString :: StringLike a => a -> String
fromString :: StringLike a => String -> a
fromChar :: StringLike a => Char -> a
strConcat :: StringLike a => [a] -> a
strNull :: StringLike a => a -> Bool
append :: StringLike a => a -> a -> a

-- | Convert a String from one type to another.
castString :: (StringLike a, StringLike b) => a -> b
instance StringLike Text
instance StringLike Text
instance StringLike ByteString
instance StringLike ByteString
instance StringLike String


-- | Combinators to match tags. Some people prefer to use <tt>(~==)</tt>
--   from <a>Text.HTML.TagSoup</a>, others prefer these more structured
--   combinators. Which you use is personal preference.
module Text.HTML.TagSoup.Match

-- | match an opening tag
tagOpen :: (str -> Bool) -> ([Attribute str] -> Bool) -> Tag str -> Bool

-- | match a closing tag
tagClose :: (str -> Bool) -> Tag str -> Bool

-- | match a text
tagText :: (str -> Bool) -> Tag str -> Bool
tagComment :: (str -> Bool) -> Tag str -> Bool

-- | match an opening tag's name literally
tagOpenLit :: Eq str => str -> ([Attribute str] -> Bool) -> Tag str -> Bool

-- | match a closing tag's name literally
tagCloseLit :: Eq str => str -> Tag str -> Bool
tagOpenAttrLit :: Eq str => str -> Attribute str -> Tag str -> Bool

-- | Match a tag with given name, that contains an attribute with given
--   name, that satisfies a predicate. If an attribute occurs multiple
--   times, all occurrences are checked.
tagOpenAttrNameLit :: Eq str => str -> str -> (str -> Bool) -> Tag str -> Bool

-- | Check if the 'Tag str' is <a>TagOpen</a> and matches the given name
tagOpenNameLit :: Eq str => str -> Tag str -> Bool

-- | Check if the 'Tag str' is <a>TagClose</a> and matches the given name
tagCloseNameLit :: Eq str => str -> Tag str -> Bool
anyAttr :: ((str, str) -> Bool) -> [Attribute str] -> Bool
anyAttrName :: (str -> Bool) -> [Attribute str] -> Bool
anyAttrValue :: (str -> Bool) -> [Attribute str] -> Bool
anyAttrLit :: Eq str => (str, str) -> [Attribute str] -> Bool
anyAttrNameLit :: Eq str => str -> [Attribute str] -> Bool
anyAttrValueLit :: Eq str => str -> [Attribute str] -> Bool
getTagContent :: Eq str => str -> ([Attribute str] -> Bool) -> [Tag str] -> [Tag str]


-- | <i>NOTE</i>: This module is preliminary and may change at a future
--   date.
--   
--   This module is intended to help converting a list of tags into a tree
--   of tags.
module Text.HTML.TagSoup.Tree
data TagTree str
TagBranch :: str -> [Attribute str] -> [TagTree str] -> TagTree str
TagLeaf :: (Tag str) -> TagTree str

-- | Convert a list of tags into a tree. This version is not lazy at all;
--   laziness is saved for version 2.
tagTree :: Eq str => [Tag str] -> [TagTree str]
flattenTree :: [TagTree str] -> [Tag str]

-- | This operation is based on the Uniplate <tt>transform</tt> function.
--   Given a list of trees, it applies the function to every tree in a
--   bottom-up manner. This operation is useful for manipulating a tree -
--   for example to make all tag names upper case:
--   
--   <pre>
--   upperCase = transformTree f
--     where f (TagBranch name atts inner) = [TagBranch (map toUpper name) atts inner]
--           f x = [x]
--   </pre>
transformTree :: (TagTree str -> [TagTree str]) -> [TagTree str] -> [TagTree str]

-- | This operation is based on the Uniplate <tt>universe</tt> function.
--   Given a list of trees, it returns those trees, and all the children
--   trees at any level. For example:
--   
--   <pre>
--   universeTree
--      [TagBranch "a" [("href","url")] [TagBranch "b" [] [TagLeaf (TagText "text")]]]
--   == [TagBranch "a" [("href","url")] [TagBranch "b" [] [TagLeaf (TagText "text")]]
--      ,TagBranch "b" [] [TagLeaf (TagText "text")]]
--   </pre>
--   
--   This operation is particularly useful for queries. To collect all
--   <tt>"a"</tt> tags in a tree, simply do:
--   
--   <pre>
--   [x | x@(TagBranch "a" _ _) &lt;- universeTree tree]
--   </pre>
universeTree :: [TagTree str] -> [TagTree str]
instance Eq str => Eq (TagTree str)
instance Ord str => Ord (TagTree str)
instance Show str => Show (TagTree str)
instance Functor TagTree


-- | This module is for working with HTML/XML. It deals with both
--   well-formed XML and malformed HTML from the web. It features:
--   
--   <ul>
--   <li>A lazy parser, based on the HTML 5 specification - see
--   <a>parseTags</a>.</li>
--   <li>A renderer that can write out HTML/XML - see
--   <a>renderTags</a>.</li>
--   <li>Utilities for extracting information from a document - see
--   <a>~==</a>, <a>sections</a> and <a>partitions</a>.</li>
--   </ul>
--   
--   The standard practice is to parse a <a>String</a> to
--   <tt>[</tt><a>Tag</a> <a>String</a><tt>]</tt> using <a>parseTags</a>,
--   then operate upon it to extract the necessary information.
module Text.HTML.TagSoup

-- | A single HTML element. A whole document is represented by a list of
--   <tt>Tag</tt>. There is no requirement for <a>TagOpen</a> and
--   <a>TagClose</a> to match.
data Tag str

-- | An open tag with <a>Attribute</a>s in their original order
TagOpen :: str -> [Attribute str] -> Tag str

-- | A closing tag
TagClose :: str -> Tag str

-- | A text node, guaranteed not to be the empty string
TagText :: str -> Tag str

-- | A comment
TagComment :: str -> Tag str

-- | Meta: A syntax error in the input file
TagWarning :: str -> Tag str

-- | Meta: The position of a parsed element
TagPosition :: !Row -> !Column -> Tag str

-- | The row/line of a position, starting at 1
type Row = Int

-- | The column of a position, starting at 1
type Column = Int

-- | An HTML attribute <tt>id="name"</tt> generates <tt>("id","name")</tt>
type Attribute str = (str, str)

-- | Parse a string to a list of tags, using an HTML 5 compliant parser.
--   
--   <pre>
--   parseTags "&lt;hello&gt;my&amp;amp;&lt;/world&gt;" == [TagOpen "hello" [],TagText "my&amp;",TagClose "world"]
--   </pre>
parseTags :: StringLike str => str -> [Tag str]

-- | Parse a string to a list of tags, using settings supplied by the
--   <a>ParseOptions</a> parameter, eg. to output position information:
--   
--   <pre>
--   parseTagsOptions parseOptions{optTagPosition = True} "&lt;hello&gt;my&amp;amp;&lt;/world&gt;" ==
--      [TagPosition 1 1,TagOpen "hello" [],TagPosition 1 8,TagText "my&amp;",TagPosition 1 15,TagClose "world"]
--   </pre>
parseTagsOptions :: StringLike str => ParseOptions str -> str -> [Tag str]

-- | These options control how <tt>parseTags</tt> works. The
--   <a>ParseOptions</a> type is usually generated by one of
--   <a>parseOptions</a>, <a>parseOptionsFast</a> or
--   <a>parseOptionsEntities</a>, then selected fields may be overridden.
--   
--   The options <a>optTagPosition</a> and <a>optTagWarning</a> specify
--   whether to generate <a>TagPosition</a> or <a>TagWarning</a> elements
--   respectively. Usually these options should be set to <tt>False</tt> to
--   simplify future stages, unless you rely on position information or
--   want to give malformed HTML messages to the end user.
--   
--   The options <a>optEntityData</a> and <a>optEntityAttrib</a> control
--   how entities, for example <tt>&amp;nbsp;</tt> are handled. Both take a
--   string, and a boolean, where <tt>True</tt> indicates that the entity
--   ended with a semi-colon <tt>;</tt>. Inside normal text
--   <a>optEntityData</a> will be called, and the results will be inserted
--   in the tag stream. Inside a tag attribute <a>optEntityAttrib</a> will
--   be called, and the first component of the result will be used in the
--   attribute, and the second component will be appended after the
--   <a>TagOpen</a> value (usually the second component is <tt>[]</tt>). As
--   an example, to not decode any entities, pass:
--   
--   <pre>
--   parseOptions
--       {optEntityData=\(str,b) -&gt; [TagText $ "&amp;" ++ str ++ [';' | b]]
--       ,optEntityAttrib=\(str,b) -&gt; ("&amp;" ++ str ++ [';' | b], [])}
--   </pre>
data ParseOptions str
ParseOptions :: Bool -> Bool -> ((str, Bool) -> [Tag str]) -> ((str, Bool) -> (str, [Tag str])) -> Bool -> ParseOptions str

-- | Should <a>TagPosition</a> values be given before some items
--   (default=False,fast=False).
optTagPosition :: ParseOptions str -> Bool

-- | Should <a>TagWarning</a> values be given (default=False,fast=False)
optTagWarning :: ParseOptions str -> Bool

-- | How to lookup an entity (Bool = has ending <tt>';'</tt>)
optEntityData :: ParseOptions str -> (str, Bool) -> [Tag str]

-- | How to lookup an entity in an attribute (Bool = has ending
--   <tt>';'</tt>?)
optEntityAttrib :: ParseOptions str -> (str, Bool) -> (str, [Tag str])

-- | Require no adjacent <a>TagText</a> values (default=True,fast=False)
optTagTextMerge :: ParseOptions str -> Bool

-- | The default parse options value, described in <a>ParseOptions</a>.
--   Equivalent to <tt><a>parseOptionsEntities</a>
--   <a>lookupEntity</a></tt>.
parseOptions :: StringLike str => ParseOptions str

-- | A <a>ParseOptions</a> structure optimised for speed, following the
--   fast options.
parseOptionsFast :: StringLike str => ParseOptions str

-- | A <a>ParseOptions</a> structure using a custom function to lookup
--   entities. Any entity that is not found will be left intact, and a
--   <a>TagWarning</a> given (if <a>optTagWarning</a> is set).
--   
--   If you do not want to resolve any entities, simply pass <tt>const
--   Nothing</tt> for the lookup function.
parseOptionsEntities :: StringLike str => (str -> Maybe str) -> ParseOptions str

-- | Show a list of tags, as they might have been parsed, using the default
--   settings given in <a>RenderOptions</a>.
--   
--   <pre>
--   renderTags [TagOpen "hello" [],TagText "my&amp;",TagClose "world"] == "&lt;hello&gt;my&amp;amp;&lt;/world&gt;"
--   </pre>
renderTags :: StringLike str => [Tag str] -> str

-- | Show a list of tags using settings supplied by the
--   <a>RenderOptions</a> parameter, eg. to avoid escaping any characters
--   one could do:
--   
--   <pre>
--   renderTagsOptions renderOptions{optEscape = id} [TagText "my&amp;"] == "my&amp;"
--   </pre>
renderTagsOptions :: StringLike str => RenderOptions str -> [Tag str] -> str

-- | Replace the four characters <tt>&amp;"&lt;&gt;</tt> with their HTML
--   entities (the list from <a>xmlEntities</a>).
escapeHTML :: StringLike str => str -> str

-- | These options control how <a>renderTags</a> works.
--   
--   The strange quirk of only minimizing <tt>&lt;br&gt;</tt> tags is due
--   to Internet Explorer treating <tt>&lt;br&gt;&lt;/br&gt;</tt> as
--   <tt>&lt;br&gt;&lt;br&gt;</tt>.
data RenderOptions str
RenderOptions :: (str -> str) -> (str -> Bool) -> (str -> Bool) -> RenderOptions str

-- | Escape a piece of text (default = escape the four characters
--   <tt>&amp;"&lt;&gt;</tt>)
optEscape :: RenderOptions str -> str -> str

-- | Minimise &lt;b&gt;&lt;/b&gt; -&gt; &lt;b/&gt; (default = minimise only
--   <tt>&lt;br&gt;</tt> tags)
optMinimize :: RenderOptions str -> str -> Bool

-- | Should a tag be output with no escaping (default = true only for
--   <tt>script</tt>)
optRawTag :: RenderOptions str -> str -> Bool

-- | The default render options value, described in <a>RenderOptions</a>.
renderOptions :: StringLike str => RenderOptions str

-- | Turns all tag names and attributes to lower case and converts DOCTYPE
--   to upper case.
canonicalizeTags :: StringLike str => [Tag str] -> [Tag str]

-- | Test if a <a>Tag</a> is a <a>TagOpen</a>
isTagOpen :: Tag str -> Bool

-- | Test if a <a>Tag</a> is a <a>TagClose</a>
isTagClose :: Tag str -> Bool

-- | Test if a <a>Tag</a> is a <a>TagText</a>
isTagText :: Tag str -> Bool

-- | Test if a <a>Tag</a> is a <a>TagWarning</a>
isTagWarning :: Tag str -> Bool

-- | Test if a <a>Tag</a> is a <a>TagPosition</a>
isTagPosition :: Tag str -> Bool

-- | Returns True if the <a>Tag</a> is <a>TagOpen</a> and matches the given
--   name
isTagOpenName :: Eq str => str -> Tag str -> Bool

-- | Returns True if the <a>Tag</a> is <a>TagClose</a> and matches the
--   given name
isTagCloseName :: Eq str => str -> Tag str -> Bool

-- | Extract the string from within <a>TagText</a>, crashes if not a
--   <a>TagText</a>
fromTagText :: Show str => Tag str -> str

-- | Extract an attribute, crashes if not a <a>TagOpen</a>. Returns
--   <tt>""</tt> if no attribute present.
fromAttrib :: (Show str, Eq str, StringLike str) => str -> Tag str -> str

-- | Extract the string from within <a>TagText</a>, otherwise
--   <a>Nothing</a>
maybeTagText :: Tag str -> Maybe str

-- | Extract the string from within <a>TagWarning</a>, otherwise
--   <a>Nothing</a>
maybeTagWarning :: Tag str -> Maybe str

-- | Extract all text content from tags (similar to Verbatim found in
--   HaXml)
innerText :: StringLike str => [Tag str] -> str

-- | This function takes a list, and returns all suffixes whose first item
--   matches the predicate.
sections :: (a -> Bool) -> [a] -> [[a]]

-- | This function is similar to <a>sections</a>, but splits the list so no
--   element appears in any two partitions.
partitions :: (a -> Bool) -> [a] -> [[a]]

-- | Define a class to allow a String or a Tag str to be used as a match
class TagRep a
toTagRep :: (TagRep a, StringLike str) => a -> Tag str

-- | Performs an inexact match, the first item should be the thing to
--   match. If the second item is a blank string, that is considered to
--   match anything. For example:
--   
--   <pre>
--   (TagText "test" ~== TagText ""    ) == True
--   (TagText "test" ~== TagText "test") == True
--   (TagText "test" ~== TagText "soup") == False
--   </pre>
--   
--   For <a>TagOpen</a> missing attributes on the right are allowed.
(~==) :: (StringLike str, TagRep t) => Tag str -> t -> Bool

-- | Negation of <a>~==</a>
(~/=) :: (StringLike str, TagRep t) => Tag str -> t -> Bool
instance TagRep String
instance StringLike str => TagRep (Tag str)
