From 8e1950465df016493e38832bdd3f229899475877 Mon Sep 17 00:00:00 2001 From: Adrian Cochrane Date: Thu, 24 Jun 2021 11:42:51 +1200 Subject: [PATCH] Parse web (or Gemini) pages & extract title. --- amphiarao.cabal | 5 +- src/Internal.hs | 16 ++++++- src/Internal/Elements.hs | 19 ++++++++ src/Internal/Load.hs | 101 ++++++++++++++++++++++++++++++++++++++- src/Main.hs | 6 ++- src/Webdriver.hs | 10 ++++ 6 files changed, 150 insertions(+), 7 deletions(-) create mode 100644 src/Internal/Elements.hs diff --git a/amphiarao.cabal b/amphiarao.cabal index 200d3d2..b274928 100644 --- a/amphiarao.cabal +++ b/amphiarao.cabal @@ -54,7 +54,8 @@ executable amphiarao main-is: Main.hs -- Modules included in this executable, other than Main. - other-modules: Webdriver, Capabilities, JSON, Messages, Internal, Internal.Load, + other-modules: Webdriver, Capabilities, JSON, Messages, + Internal, Internal.Load, Internal.Elements, UI.Templates, UI.Search -- LANGUAGE extensions used by modules in this package. @@ -64,7 +65,7 @@ executable amphiarao build-depends: base >=4.9 && <4.10, happstack-lite >=7.3.7 && <7.4, happstack-server, aeson >= 1.5.0 && <1.6, text, bytestring, unordered-containers, vector, containers >=0.5 && <0.7, uuid >=1.3 && <1.4, - blaze-html, + blaze-html, xml-conduit >= 1.8 && < 1.9, html-conduit >= 1.3 && <1.4, hurl >= 2.1 && <3, network-uri -- Directories containing source files. diff --git a/src/Internal.hs b/src/Internal.hs index a4f07a0..946a7ac 100644 --- a/src/Internal.hs +++ b/src/Internal.hs @@ -16,6 +16,8 @@ import GHC.Generics import qualified Network.URI as URI import qualified Network.URI.Fetch as URI +import qualified Text.XML as XML +import qualified Data.Map as M' type Sessions = MVar (M.HashMap UUID Session) type Session = MVar Session' @@ -25,7 +27,8 @@ data Session' = Session { loader :: URI.Session, currentURL :: URI.URI, backStack :: [URI.URI], - nextStack :: [URI.URI] + nextStack :: [URI.URI], + document :: XML.Document } initSessions :: IO Sessions @@ -42,7 +45,16 @@ createSession sessions caps = do _ -> Timeouts Nothing Nothing Nothing, loader = loader', currentURL = URI.nullURI, - backStack = [], nextStack = [] + backStack = [], nextStack = [], + document = XML.Document { + XML.documentPrologue = XML.Prologue [] Nothing [], + XML.documentRoot = XML.Element { + XML.elementName = "html", + XML.elementAttributes = M'.empty, + XML.elementNodes = [] + }, + XML.documentEpilogue = [] + } } session' <- newMVar session modifyMVar_ sessions (return . M.insert uuid session') diff --git a/src/Internal/Elements.hs b/src/Internal/Elements.hs new file mode 100644 index 0000000..5435ead --- /dev/null +++ b/src/Internal/Elements.hs @@ -0,0 +1,19 @@ +{-# LANGUAGE OverloadedStrings #-} +module Internal.Elements (getTitle) where + +import Text.XML +import qualified Data.Map as M +import Data.Text as Txt +import Control.Concurrent.MVar + +import Internal + +getTitle :: Session -> IO Text +getTitle session = getTitle' <$> documentRoot <$> document <$> readMVar session + +getTitle' (Element "title" _ childs) = Txt.concat [txt | NodeContent txt <- childs] +getTitle' (Element "h1" _ childs) = Txt.concat [txt | NodeContent txt <- childs] +getTitle' (Element _ _ childs) + -- FIXME: Caught Rhapsode bug repaired here, needs that filtering condition. + | title:_ <- [getTitle' el | NodeElement el <- childs, getTitle' el /= ""] = title + | otherwise = "" diff --git a/src/Internal/Load.hs b/src/Internal/Load.hs index 05c8ac4..c04d027 100644 --- a/src/Internal/Load.hs +++ b/src/Internal/Load.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings, PatternSynonyms, ViewPatterns #-} module Internal.Load(load, load', back, next, parseAbsoluteURI) where import Internal @@ -8,19 +9,26 @@ import Control.Monad.IO.Class import Data.Aeson import Data.Text (Text, pack) +import qualified Data.Text as Txt +import Data.Text.Lazy (fromStrict) import GHC.Generics import Data.Maybe (fromMaybe) +import qualified Data.ByteString.Lazy as B (toStrict) import Network.URI as URI import Network.URI.Fetch as URI +import Network.URI.Charset (convertCharset) +import qualified Text.HTML.DOM as HTML +import qualified Text.XML as XML +import qualified Data.Map as M mime = words "text/html text/xml application/xml application/xhtml+xml text/plain" load' :: Internal.Session -> URI -> IO () load' session uri = modifyMVar_ session $ \session' -> maybeTimeout session' uri $ do - (redirected, _, _) <- fetchURL' (loader session') mime uri - return $ session' { currentURL = redirected} + resp@(redirected, _, _) <- fetchURL' (loader session') mime uri + return $ session' { currentURL = redirected, document = parseDocument resp } maybeTimeout :: Session' -> URI -> IO Session' -> IO Session' maybeTimeout session uri act | currentURL session /= uri, Just delay <- pageLoad $ timeouts session = @@ -53,3 +61,92 @@ next session = do where inner session'@Session { backStack = bs, currentURL = b, nextStack = n:ns } = (session' { backStack = b:bs, nextStack = ns }, n) + +--- + +parseDocument (uri, "html/x-error\t", resp) = parseDocument (uri, "text/html", resp) +parseDocument (_, "text/html", Left text) = HTML.parseLT $ fromStrict text +parseDocument (_, "text/html", Right bytes) = HTML.parseLBS bytes +parseDocument (uri, mime, resp) | mime /= mime' = parseDocument (uri, mime', resp) + where mime' = takeWhile (/= ';') mime +parseDocument (_, _, Left text) + | Right doc <- XML.parseText XML.def $ fromStrict text = doc + | otherwise = pageForText text +parseDocument (_, _, Right bytes) | Right doc <- XML.parseLBS XML.def bytes = doc +parseDocument (_, 't':'e':'x':'t':'/':_, Right bytes) = pageForText $ utf8' bytes -- charset wasn't specified, so assume utf-8. +parseDocument (_, mime, Right _) = pageForText $ pack ('(':mime ++ " binary data)") + +pageForText txt = XML.Document { + XML.documentPrologue = XML.Prologue [] Nothing [], + XML.documentRoot = XML.Element { + XML.elementName = "pre", + XML.elementAttributes = M.empty, + XML.elementNodes = [XML.NodeContent txt] + }, + XML.documentEpilogue = [] + } + +utf8' bytes = convertCharset "utf-8" $ B.toStrict bytes + +-------- +---- Gemini implementation +-------- +-- Copied from css-syntax. +pattern (:.) :: Char -> Txt.Text -> Txt.Text +pattern x :. xs <- (Txt.uncons -> Just (x, xs)) + +infixr 5 :. + +el name text = XML.Element name M.empty [XML.NodeContent text] + +parseGemini :: Maybe String -> Txt.Text -> XML.Document +parseGemini lang txt = XML.Document { + XML.documentPrologue = XML.Prologue [] Nothing [], + XML.documentRoot = XML.Element { + XML.elementName = "body", + XML.elementAttributes = M.fromList [ + ("lang", Txt.pack lang') | Just langs <- [lang], lang' <- [csv langs]], + XML.elementNodes = map XML.NodeElement $ parseGemini' $ Txt.lines txt + }, + XML.documentEpilogue = [] + } + +csv (',':_) = "" +csv (c:rest) = c:csv rest +csv "" = "" + +parseGemini' :: [Txt.Text] -> [XML.Element] +parseGemini' (('#':.'#':.'#' :. '#':.'#':.'#':.line):lines) = + el "h6" line : parseGemini' lines +parseGemini' (('#':.'#':.'#' :. '#':.'#':.line):lines) = + el "h5" line : parseGemini' lines +parseGemini' (('#':.'#':.'#' :. '#':.line):lines) = + el "h4" line : parseGemini' lines +parseGemini' (('#':.'#':.'#':.line):lines) = el "h3" line : parseGemini' lines +parseGemini' (('#':.'#':.line):lines) = el "h2" line : parseGemini' lines +parseGemini' (('#':.line):lines) = el "h1" line : parseGemini' lines +-- Not properly structured, but still sounds fine... +parseGemini' (('*':.line):lines) = el "li" line : parseGemini' lines +parseGemini' (('>':.line):lines) = el "blockquote" line : parseGemini' lines + +parseGemini' (('=':.'>':.line):lines) + | (url:text@(_:_)) <- Txt.words line = (el "a" $ Txt.unwords text) { + XML.elementAttributes = M.insert "href" url M.empty + } : parseGemini' lines + | otherwise = (el "a" $ Txt.strip line) { + XML.elementAttributes = M.insert "href" (Txt.strip line) M.empty + } : parseGemini' lines +parseGemini' (('`':.'`':.'`':.line):lines) = el "p" line : go lines + where + go (('`':.'`':.'`':._):lines) = parseGemini' lines + go (_:lines) = go lines + go [] = [] +parseGemini' ("```":lines) = go [] lines + where + go texts (('`':.'`':.'`':._):lines) = + el "pre" (Txt.unlines texts) : parseGemini' lines + go texts (line:lines) = go (texts ++ [line]) lines + go texts [] = [] + +parseGemini' (line:lines) = el "p" line : parseGemini' lines +parseGemini' [] = [] diff --git a/src/Main.hs b/src/Main.hs index 8a17aee..a79024f 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -19,6 +19,7 @@ import Happstack.Server.I18N import Internal import Internal.Load as Load +import Internal.Elements as El import Control.Monad.IO.Class (liftIO) import Control.Monad (forM) @@ -87,7 +88,10 @@ sessionHome session = do method GET session' <- liftIO $ readMVar session - Tpl.inspector ok "UUID" session' $ \langs -> H.h1 $ string $ show $ currentURL session' + title <- liftIO $ El.getTitle session + Tpl.inspector ok "title" session' $ \langs -> do + H.h1 $ text title + H.p $ string $ show $ currentURL session' session404 uuid = do Tpl.page notFound ["404", "Amphiarao"] $ \langs -> do diff --git a/src/Webdriver.hs b/src/Webdriver.hs index cff9b71..e1500ca 100644 --- a/src/Webdriver.hs +++ b/src/Webdriver.hs @@ -22,6 +22,7 @@ import Capabilities (processCaps) import JSON import qualified Internal as WD import qualified Internal.Load as WD +import qualified Internal.Elements as WDE serveWebdriver :: WD.Sessions -> ServerPart Response serveWebdriver sessions = do @@ -41,6 +42,7 @@ serveSession sessions = WD.withSession fail (\uuid session -> msum [ dir "refresh" $ reloadPage session, dir "back" $ sessionAction WD.back session, dir "forward" $ sessionAction WD.next session, + dir "title" $ sessionTitle session, dir "window" $ msum [ -- Noops getWindowHandle uuid, delSession sessions uuid, -- Closing the only window closes the session. @@ -130,6 +132,12 @@ sessionAction cb session = do liftIO $ cb session ok $ toResponse () +sessionTitle session = do + method GET + nullDir + ret <- liftIO $ WDE.getTitle session + ok $ toResponse ret + ---- Windowing noops getWindowHandle uuid = do method GET @@ -161,3 +169,5 @@ noSuchFrame = do unsupportedOp = do nullDir errJSON 400 "unsupported operation" "Windowsize is meaningless to Rhapsode" + +---- -- 2.30.2