Commit 2d3960fd authored by Bjørnar Hansen's avatar Bjørnar Hansen Committed by Bjørnar Hansen

Open source release.

parents
dist/
*.hi
*.o
import Text.Html.Html5.Parser
import System.Environment
main = putStrLn . renderWithWarnings . parseTree [withWarnings] =<< readFile =<< head `fmap` getArgs
module Html5Parser.Test where
-- Stolen from the excellent tagsoup library.
--
-- With small modifications as this library parses the doctypes exactly.
import Text.Html.Html5.Soup
type Test a = IO a
pass :: Test ()
pass = return ()
runTest :: Test () -> IO ()
runTest x = x >> putStrLn "All tests passed"
(===) :: (Show a, Eq a) => a -> a -> IO ()
a === b = if a == b then pass else fail $ "Does not equal: " ++ show a ++ " =/= " ++ show b
main = runTest parseTests
parseTests :: Test ()
parseTests = do
parseSoup "<!DOCTYPE TEST>" === [TagDoctype (Just "test") Nothing Nothing]
parseSoup "<!DOCTYPE html PUBLIC\"a\"'a'>" === [TagDoctype (Just "html") (Just "a") (Just "a")]
parseSoup "<!DOCTYPE html PUBLIC'a'\"a\">" === [TagDoctype (Just "html") (Just "a") (Just "a")]
parseSoup "<!DOCTYPE html PUBLIC'a''a'>" === [TagDoctype (Just "html") (Just "a") (Just "a")]
parseSoup "<!DOCTYPE html PUBLIC\"a\"\"a\">" === [TagDoctype (Just "html") (Just "a") (Just "a")]
parseSoup "<!DOCTYPE html SYSTEM \"a\">" === [TagDoctype (Just "html") Nothing (Just "a")]
parseSoup "<!DOCTYPE html SYSTEM 'a'>" === [TagDoctype (Just "html") Nothing (Just "a")]
parseSoup "<!DOCTYPE html SYSTEM 'a' hello>" === [TagDoctype (Just "html") Nothing (Just "a")]
parseSoup "<test \"foo bar\">" === [TagOpen "test" [("\"foo",""),("bar\"","")]]
parseSoup "<test baz \"foo\">" === [TagOpen "test" [("baz",""),("\"foo\"","")]]
parseSoup "<test 'foo bar'>" === [TagOpen "test" [("'foo",""),("bar'","")]]
parseSoup "<test bar=''' />" === [TagOpen "test" [("bar",""),("'","")], TagClose "test"]
parseSoup "<test2 a b>" === [TagOpen "test2" [("a",""),("b","")]]
parseSoup "<test2 ''>" === [TagOpen "test2" [("''","")]]
parseSoup "</test foo>" === [TagClose "test"]
parseSoup "<test/>" === [TagOpen "test" [], TagClose "test"]
parseSoup "<test1 a = b>" === [TagOpen "test1" [("a","b")]]
parseSoup "hello &amp; world" === [TagText "hello & world"]
parseSoup "hello &#64; world" === [TagText "hello @ world"]
parseSoup "hello &#x40; world" === [TagText "hello @ world"]
parseSoup "hello &haskell; world" === [TagText "hello &haskell; world"]
parseSoup "hello \n\t world" === [TagText "hello \n\t world"]
parseSoup "<a href=http://www.google.com>" === [TagOpen "a" [("href","http://www.google.com")]]
parseSoup "<foo bar=\"bar&#54;baz\">" === [TagOpen "foo" [("bar","bar6baz")]]
parseSoup "<foo bar=\"bar&amp;baz\">" === [TagOpen "foo" [("bar","bar&baz")]]
parseSoup "hey &how are you" === [TagText "hey &how are you"]
parseSoup "hey &how; are you" === [TagText "hey &how; are you"]
parseSoup "hey &amp are you" === [TagText "hey & are you"]
parseSoup "hey &amp; are you" === [TagText "hey & are you"]
parseSoup "<!-- -\0 -->" === [TagComment " -\xFFFD "]
-- Lower-case attributes and tags.
parseSoup "<META HTTP-EQUIV=\"content-type\" CONTENT=\"text/html; charset=UTF-8\">"
=== [TagOpen "meta" [ ("http-equiv","content-type")
, ("content","text/html; charset=UTF-8")]]
-- real cases reported by users
parseSoup "<a href=\"series.php?view=single&ID=72710\">" === [TagOpen "a" [("href","series.php?view=single&ID=72710")]]
parseSoup "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">" ===
[TagDoctype {tagDocName = Just "html", tagPubId = Just "-//W3C//DTD HTML 4.01//EN", tagSysId = Just "http://www.w3.org/TR/html4/strict.dtd"}]
parseSoup "<script src=\"http://edge.jobthread.com/feeds/jobroll/?s_user_id=100540&subtype=slashdot\">" ===
[TagOpen "script" [("src","http://edge.jobthread.com/feeds/jobroll/?s_user_id=100540&subtype=slashdot")]]
parseSoup "<a title='foo'bar' href=correct>text" === [TagOpen "a" [("title","foo"),("bar'",""),("href", "correct")],TagText "text"]
parseSoup "<test><![CDATA[Anything goes, <em>even hidden markup</em> &amp; entities]]> but this is outside</test>" ===
[TagOpen "test" [],TagText "Anything goes, <em>even hidden markup</em> &amp; entities but this is outside",TagClose "test"]
parseSoup "<a href='random.php'><img src='strips/130307.jpg' alt='nukular bish'' title='' /></a>" ===
[TagOpen "a" [("href","random.php")],TagOpen "img" [("src","strips/130307.jpg"),("alt","nukular bish"),("'",""),("title","")],TagClose "img",TagClose "a"]
parseSoup "<p>some text</p\n<img alt='&lt; &yyy; &gt;' src=\"abc.gif\">" ===
[TagOpen "p" [],TagText "some text",TagClose "p"]
parseSoup "<foo bar=\"bar&amp=baz\">" === [TagOpen "foo" [("bar","bar&amp=baz")]]
-- \r removed in unimplemented preprocess-stage.
parseSoup "\r \r\n \n" === [TagText "\n \n \n"]
parseSoup "<a \r\n href=\"url\">" === [TagOpen "a" [("href","url")]]
{-# LANGUAGE UnicodeSyntax #-}
module Html5Parser.TreeTests where
import Text.Html.Html5.Parser.Tree
import Text.Html.Html5.Parser.Types
(===) String String IO ()
a === b = if test a == b then return () else fail ("test " ++ a ++ " != " ++ b ++ "\nWas: " ++ test a)
main = do
"<!DOCTYPE html>hello" === "<!DOCTYPE html><html><head /><body>hello</body></html>"
"<!DOCTYPE html>hello<body a=a>" === "<!DOCTYPE html><html><head /><body a=\"a\">hello<!-- WARNING 1,21: unexpected <body> in body --></body></html>"
putStrLn "All passed."
Copyright (c) 2015, Tingtun AS, http://tingtun.no
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# html5parser
import Distribution.Simple
main = defaultMain
{-# LANGUAGE UnicodeSyntax #-}
module Text.Html.Html5.Parser
( parseTree, parseInBody
, State(..), Node(..), Str(..)
, render, renderDocument, renderElement, renderAttributes
, renderWithWarnings
, renderStartTag
, withWarnings, withoutWarnings
, Position(..), Configuration, Document(..)
, ParseError (..), ErrorType (..)
, Attributes, Attribute
, parse
) where
import Data.Foldable
import Text.Html.Html5.Parser.Tokenizer (tData)
import Text.Html.Html5.Parser.Types
import Text.Html.Html5.Parser.Tree
type Configuration α = State α State α
-- | Parse a string into a tree.
parseTree Str α [Configuration α] α Node α
parseTree conf = parse conf . preprocess
-- | Parse a fragment in body context.
parseInBody Str α [Configuration α] α Node α
parseInBody conf = parse' conf (inBody True) tData . preprocess
-- | Include warnings in the document.
withWarnings Configuration α
withWarnings x = x { stIncludeWarnings_ = True }
-- | Do not include warnings in the document (default).
withoutWarnings Configuration α
withoutWarnings x = x { stIncludeWarnings_ = False }
renderDocument Str α Node α String
renderDocument (Element _ _ _ xs) = toString . fold $ fmap render xs
renderDocument x = toString $ render x
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE UnicodeSyntax #-}
{-# LANGUAGE RecordWildCards #-}
module Text.Html.Html5.Parser.Doctypes where
import Prelude.Unicode
import Text.Html.Html5.Parser.Types
-- | 13.2.5.4, the valid doctypes.
valid IsString α [(Maybe α, Maybe α, Maybe α)]
valid = [ (Just "html", Nothing, Nothing)
, (Just "html", Nothing, Just "about:legacy-compat")
, (Just "html", Just "-//W3C//DTD HTML 4.0//EN"
, Just "http://www.w3.org/TR/REC-html40/strict.dtd")
, (Just "html", Just "-//W3C//DTD HTML 4.01//EN"
, Just "http://www.w3.org/TR/html4/strict.dtd")
, (Just "html", Just "-//W3C//DTD XHTML 1.0 Strict//EN"
, Just "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd")
, (Just "html", Just "-//W3C//DTD XHTML 1.1//EN"
, Just "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd")
]
-- | 13.2.5.4, check if a Doctype token sets the document in limited quirks mode.
-- Only true if 'isQuirks' returns False.
isLimitedQuirks Str α Token α Bool
isLimitedQuirks TokDoctype {..} =
isJust tokPubId
( f "-//W3C//DTD XHTML 1.0 Frameset//"
f "-//W3C//DTD XHTML 1.0 Transitional//"
( isJust tokSysId
( f "-//W3C//DTD HTML 4.01 Frameset//"
f "-//W3C//DTD HTML 4.01 Transitional//")))
where x = toLower $ fromJust tokPubId
f = (`isPrefixOf` x)
isLimitedQuirks _ = error "isLimitedQuirks: only works on doctypes."
-- | 13.2.5.4, check of a Doctype token sets the document in quirks mode.
isQuirks Str α Token α Bool
isQuirks TokDoctype {..} =
tokForceQuirks
tokDocName Just "html"
(isJust tokPubId
let x = toLower $ fromJust tokPubId
f = (`isPrefixOf` x)
in x "-/w3c/dtd html 4.0 transitional/en"
x "html"
x "-//w3o//dtd w3 html strict 3.0//en//"
f "+//silmaril//dtd html pro v0r11 19970101//"
f "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
f "-//as//dtd html 3.0 aswedit + extensions//"
f "-//ietf//dtd html 2.0 level 1//"
f "-//ietf//dtd html 2.0 level 2//"
f "-//ietf//dtd html 2.0 strict level 1//"
f "-//ietf//dtd html 2.0 strict level 2//"
f "-//ietf//dtd html 2.0 strict//"
f "-//ietf//dtd html 2.0//"
f "-//ietf//dtd html 2.1e//"
f "-//ietf//dtd html 3.0//"
f "-//ietf//dtd html 3.2 final//"
f "-//ietf//dtd html 3.2//"
f "-//ietf//dtd html 3//"
f "-//ietf//dtd html level 0//"
f "-//ietf//dtd html level 1//"
f "-//ietf//dtd html level 2//"
f "-//ietf//dtd html level 3//"
f "-//ietf//dtd html strict level 0//"
f "-//ietf//dtd html strict level 1//"
f "-//ietf//dtd html strict level 2//"
f "-//ietf//dtd html strict level 3//"
f "-//ietf//dtd html strict//"
f "-//ietf//dtd html//"
f "-//metrius//dtd metrius presentational//"
f "-//microsoft//dtd internet explorer 2.0 html strict//"
f "-//microsoft//dtd internet explorer 2.0 html//"
f "-//microsoft//dtd internet explorer 2.0 tables//"
f "-//microsoft//dtd internet explorer 3.0 html strict//"
f "-//microsoft//dtd internet explorer 3.0 html//"
f "-//microsoft//dtd internet explorer 3.0 tables//"
f "-//netscape comm. corp.//dtd html//"
f "-//netscape comm. corp.//dtd strict html//"
f "-//o'reilly and associates//dtd html 2.0//"
f "-//o'reilly and associates//dtd html extended 1.0//"
f "-//o'reilly and associates//dtd html extended relaxed 1.0//"
f "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
f "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
f "-//spyglass//dtd html 2.0 extended//"
f "-//sq//dtd html 2.0 hotmetal + extensions//"
f "-//sun microsystems corp.//dtd hotjava html//"
f "-//sun microsystems corp.//dtd hotjava strict html//"
f "-//w3c//dtd html 3 1995-03-24//"
f "-//w3c//dtd html 3.2 draft//"
f "-//w3c//dtd html 3.2 final//"
f "-//w3c//dtd html 3.2//"
f "-//w3c//dtd html 3.2s draft//"
f "-//w3c//dtd html 4.0 frameset//"
f "-//w3c//dtd html 4.0 transitional//"
f "-//w3c//dtd html experimental 19960712//"
f "-//w3c//dtd html experimental 970421//"
f "-//w3c//dtd w3 html//"
f "-//w3o//dtd w3 html 3.0//"
f "-//webtechs//dtd mozilla html 2.0//"
f "-//webtechs//dtd mozilla html//")
( isJust tokSysId
(toLower $ fromJust tokSysId) "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
)
( isNothing tokSysId isJust tokPubId
let x = toLower $ fromJust tokPubId
f = (`isPrefixOf` x)
in f "-//W3C//DTD HTML 4.01 Frameset//"
f "-//W3C//DTD HTML 4.01 Transitional//"
)
isQuirks _ = error "isQuirks can only be performed on a Doctype token."
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE UnicodeSyntax #-}
module Text.Html.Html5.Soup where
import Prelude.Unicode
import Text.Html.Html5.Parser.Types (Str, Int64, ())
import qualified Text.Html.Html5.Parser.Types as Ty
import qualified Text.Html.Html5.Parser.Tokenizer as T
type Column = Int64
type Line = Int64
data Tag α = TagDoctype { tagDocName, tagPubId, tagSysId Maybe α }
| TagOpen α [(α, α)]
| TagClose α
| TagText α
| TagComment α
| TagWarning α
| TagPosition !Line !Column
deriving (Eq, Ord, Show)
tokenToSoup Ty.Str α Ty.Token α [Tag α]
tokenToSoup Ty.TokDoctype {..} = [TagDoctype tokDocName tokPubId tokSysId]
tokenToSoup Ty.TokTag {..} | tokIsEndTag False =
[TagOpen tokName tokAttributes] ++ if tokSelfClosing then [TagClose tokName] else []
tokenToSoup Ty.TokTag {..} | tokIsEndTag True = [TagClose tokName]
tokenToSoup Ty.TokComment {..} = [TagComment tokData]
tokenToSoup Ty.TokWarning {..} = [TagWarning . Ty.fromString $ show tokWarning]
tokenToSoup Ty.TokCharacter {..} = [TagText tokData]
tokenToSoup Ty.TokEOF {} = []
dropWarnings [Tag α] [Tag α]
dropWarnings (TagWarning {}:xs) = dropWarnings xs
dropWarnings (x:xs) = x:dropWarnings xs
dropWarnings _ = []
concatTexts Str α [Tag α] [Tag α]
concatTexts (TagText x:TagText y:xs) = concatTexts (TagText (x y):xs)
concatTexts (x:xs) = x:concatTexts xs
concatTexts _ = []
parseSoup Str α α [Tag α]
parseSoup = concatTexts . dropWarnings . concat . map tokenToSoup
. T.pogoStick T.tData . Ty.stdTSt
. Ty.preprocess
name: html5parser
version: 0.1
cabal-version: >=1.2
build-type: Simple
license: BSD3
license-file: LICENSE
synopsis: A HTML5-compliant parser.
category: Web
author: Various
library
build-depends:
bytestring-trie ==0.2.*,
data-accessor-transformers ==0.2.*,
base ==4.*,
base-unicode-symbols ==0.2.*,
data-accessor ==0.2.*,
data-accessor-template ==0.2.*,
transformers >=0.2 && <=0.5,
utility-ht >=0.0.7,
bytestring >=0.9,
data-default >=0.3
exposed-modules:
Text.Html.Html5.Parser
Text.Html.Html5.Soup
other-modules:
Text.Html.Html5.Parser.Types
Text.Html.Html5.Parser.Tree
Text.Html.Html5.Parser.NamedEntities
Text.Html.Html5.Parser.Tokenizer
Text.Html.Html5.Parser.Doctypes
ghc-options: -O2
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment