From a59899ddb2ccbe23b8848fa2695fea074c325c44 Mon Sep 17 00:00:00 2001 From: Adrian Cochrane Date: Sun, 14 Jul 2019 13:41:48 +1200 Subject: [PATCH] Parse HTML, draft code to apply it. --- rhapsode.cabal | 3 +- src/Main.hs | 74 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 66 insertions(+), 11 deletions(-) diff --git a/rhapsode.cabal b/rhapsode.cabal index 4c3e2a9..3c62115 100644 --- a/rhapsode.cabal +++ b/rhapsode.cabal @@ -63,7 +63,8 @@ executable rhapsode build-depends: base >=4.9 && <4.10, http-client, http-client-tls, bytestring, html-conduit, xml-conduit, text, containers, - network-uri + network-uri, + stylish-haskell >= 0.2.0, css-syntax -- Directories containing source files. hs-source-dirs: src diff --git a/src/Main.hs b/src/Main.hs index ed0d9b8..a2f8975 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -7,11 +7,17 @@ import qualified Network.HTTP.Client as HTTP import Network.HTTP.Client.Internal import qualified Network.HTTP.Client.TLS as TLS import Network.URI +import qualified Data.ByteString.Lazy.Char8 as C8 + import qualified Text.HTML.DOM as HTML import qualified Text.XML as XML -import Data.Text (unpack) +import Data.Text as Txt (pack, unpack, Text(..), append) import qualified Data.Map as M -import qualified Data.ByteString.Lazy.Char8 as C8 + +import qualified Data.CSS.Syntax.StyleSheet as CSS +import qualified Data.CSS.Style as Style +import qualified Data.CSS.Syntax.Tokens as CSSTok +import qualified Data.List as L import DefaultCSS @@ -24,11 +30,21 @@ main = do response <- HTTP.httpLbs request manager let html = XML.documentRoot $ HTML.parseLBS $ HTTP.responseBody response style <- retreiveStyles html manager request - putStrLn $ unlines style + C8.putStrLn $ HTTP.responseBody response + + +cssPriorityAgent = 1 +cssPriorityUser = 2 +cssPriorityAuthor = 3 retreiveStyles html manager base = do css <- externalStyles html manager base - return $ userAgentCSS : css ++ internalStyles html + return $ authorStyle (css ++ internalStyles html) + where + emptyStyle :: Style.QueryableStyleSheet Style.TrivialPropertyParser + emptyStyle = Style.queryableStyleSheet {Style.priority = cssPriorityAgent} + agentStyle = CSS.parse emptyStyle $ Txt.pack userAgentCSS + authorStyle = foldl CSS.parse $ agentStyle {Style.priority = cssPriorityAuthor} externalStyles html manager base = go $ linkedStyles html where -- TODO parallelise loads @@ -36,14 +52,14 @@ externalStyles html manager base = go $ linkedStyles html request <- setUriRelative base link response <- HTTP.httpLbs request manager rest <- go links - return (C8.unpack (HTTP.responseBody response) : rest) + return (Txt.pack (C8.unpack $ HTTP.responseBody response) : rest) go [] = return [] linkedStyles (XML.Element "link" attrs _) | Just link <- "href" `M.lookup` attrs, Just "stylesheet" <- "rel" `M.lookup` attrs, testMedia attrs, - Just uri <- parseURIReference $ unpack link = [uri] + Just uri <- parseURIReference $ Txt.unpack link = [uri] linkedStyles (XML.Element _ _ children) = concat [linkedStyles el | XML.NodeElement el <- children] @@ -55,11 +71,49 @@ internalStyles (XML.Element _ _ children) = testMedia attrs = media == Nothing || media == Just "speech" where media = "media" `M.lookup` attrs - -strContent (XML.NodeContent text : rest) = unpack text ++ strContent rest +strContent :: [XML.Node] -> Txt.Text +strContent (XML.NodeContent text : rest) = text `Txt.append` strContent rest -- We do want to read in comments for CSS, just not for display. -strContent (XML.NodeComment text : rest) = unpack text ++ strContent rest +strContent (XML.NodeComment text : rest) = text `Txt.append` strContent rest strContent (XML.NodeElement (XML.Element _ _ children):rest) = - strContent children ++ strContent rest + strContent children `Txt.append` strContent rest strContent (_:rest) = strContent rest strContent [] = "" + + +stylizeEl = stylizeEl' Nothing Style.temp Nothing +stylizeEl' parent parentStyle previous stylesheet el@(XML.Element _ _ children) = + XML.Element { + XML.elementName = XML.Name "style" Nothing Nothing, + XML.elementAttributes = innerStyle, + XML.elementNodes = stylizeNodes (Just stylishEl) style Nothing stylesheet children + } where + style@(MPP innerStyle) = Style.cascade stylesheet stylishEl overrides parentStyle + stylishEl = elToStylish el parent previous + overrides = [] -- TODO parse style attribute +stylizeNodes up upStyle prev styles (XML.NodeContent txt:nodes) = + XML.NodeContent txt : stylizeNodes up upStyle prev styles nodes +stylizeNodes up upStyle prev styles (XML.NodeElement el:nodes) = + XML.NodeElement (stylizeEl' up upStyle prev styles el) : + stylizeNodes up upStyle stylishEl styles nodes + where stylishEl = Just $ elToStylish el up prev +stylizeNodes _ _ _ _ [] = [] + +elToStylish (XML.Element (XML.Name name _ _) attrs _) parent previous = + Style.ElementNode { + Style.name = name, + Style.attributes = L.sort [ + Style.Attribute (XML.nameLocalName name) (Txt.unpack value) + | (name, value) <- M.toList attrs + ], + Style.parent = parent, + Style.previous = previous + } + +data MapPropertyParser = MPP (M.Map XML.Name Text) +instance Style.PropertyParser MapPropertyParser where + temp = MPP M.empty + longhand _ (MPP self) name value = Just $ MPP $ M.insert xmlName xmlValue self + where + xmlName = XML.Name name Nothing Nothing + xmlValue = CSSTok.serialize value -- 2.30.2