~alcinnz/amphiarao

8e1950465df016493e38832bdd3f229899475877 — Adrian Cochrane 3 years ago 9414d29
Parse web (or Gemini) pages & extract title.
6 files changed, 150 insertions(+), 7 deletions(-)

M amphiarao.cabal
M src/Internal.hs
A src/Internal/Elements.hs
M src/Internal/Load.hs
M src/Main.hs
M src/Webdriver.hs
M amphiarao.cabal => amphiarao.cabal +3 -2
@@ 54,7 54,8 @@ executable amphiarao
  main-is:             Main.hs
  
  -- Modules included in this executable, other than Main.
  other-modules:       Webdriver, Capabilities, JSON, Messages, Internal, Internal.Load,
  other-modules:       Webdriver, Capabilities, JSON, Messages,
        Internal, Internal.Load, Internal.Elements,
        UI.Templates, UI.Search
  
  -- LANGUAGE extensions used by modules in this package.


@@ 64,7 65,7 @@ executable amphiarao
  build-depends:       base >=4.9 && <4.10, happstack-lite >=7.3.7 && <7.4, happstack-server,
        aeson >= 1.5.0 && <1.6, text, bytestring, unordered-containers, vector,
        containers >=0.5 && <0.7, uuid >=1.3 && <1.4,
        blaze-html,
        blaze-html, xml-conduit >= 1.8 && < 1.9, html-conduit >= 1.3 && <1.4,
        hurl >= 2.1 && <3, network-uri
  
  -- Directories containing source files.

M src/Internal.hs => src/Internal.hs +14 -2
@@ 16,6 16,8 @@ import GHC.Generics

import qualified Network.URI as URI
import qualified Network.URI.Fetch as URI
import qualified Text.XML as XML
import qualified Data.Map as M'

type Sessions = MVar (M.HashMap UUID Session)
type Session = MVar Session'


@@ 25,7 27,8 @@ data Session' = Session {
    loader :: URI.Session,
    currentURL :: URI.URI,
    backStack :: [URI.URI],
    nextStack :: [URI.URI]
    nextStack :: [URI.URI],
    document :: XML.Document
  }

initSessions :: IO Sessions


@@ 42,7 45,16 @@ createSession sessions caps = do
            _ -> Timeouts Nothing Nothing Nothing,
        loader = loader',
        currentURL = URI.nullURI,
        backStack = [], nextStack = []
        backStack = [], nextStack = [],
        document = XML.Document {
            XML.documentPrologue = XML.Prologue [] Nothing [],
            XML.documentRoot = XML.Element {
                XML.elementName = "html",
                XML.elementAttributes = M'.empty,
                XML.elementNodes = []
            },
            XML.documentEpilogue = []
        }
      }
    session' <- newMVar session
    modifyMVar_ sessions (return . M.insert uuid session')

A src/Internal/Elements.hs => src/Internal/Elements.hs +19 -0
@@ 0,0 1,19 @@
{-# LANGUAGE OverloadedStrings #-}
module Internal.Elements (getTitle) where

import Text.XML
import qualified Data.Map as M
import Data.Text as Txt
import Control.Concurrent.MVar

import Internal

getTitle :: Session -> IO Text
getTitle session = getTitle' <$> documentRoot <$> document <$> readMVar session

getTitle' (Element "title" _ childs) = Txt.concat [txt | NodeContent txt <- childs]
getTitle' (Element "h1" _ childs) = Txt.concat [txt | NodeContent txt <- childs]
getTitle' (Element _ _ childs)
    -- FIXME: Caught Rhapsode bug repaired here, needs that filtering condition.
    | title:_ <- [getTitle' el | NodeElement el <- childs, getTitle' el /= ""] = title
    | otherwise = ""

M src/Internal/Load.hs => src/Internal/Load.hs +99 -2
@@ 1,3 1,4 @@
{-# LANGUAGE OverloadedStrings, PatternSynonyms, ViewPatterns #-}
module Internal.Load(load, load', back, next, parseAbsoluteURI) where

import Internal


@@ 8,19 9,26 @@ import Control.Monad.IO.Class

import Data.Aeson
import Data.Text (Text, pack)
import qualified Data.Text as Txt
import Data.Text.Lazy (fromStrict)
import GHC.Generics

import Data.Maybe (fromMaybe)
import qualified Data.ByteString.Lazy as B (toStrict)

import Network.URI as URI
import Network.URI.Fetch as URI
import Network.URI.Charset (convertCharset)
import qualified Text.HTML.DOM as HTML
import qualified Text.XML as XML
import qualified Data.Map as M

mime = words "text/html text/xml application/xml application/xhtml+xml text/plain"

load' :: Internal.Session -> URI -> IO ()
load' session uri = modifyMVar_ session $ \session' -> maybeTimeout session' uri $ do
    (redirected, _, _) <- fetchURL' (loader session') mime uri
    return $ session' { currentURL = redirected}
    resp@(redirected, _, _) <- fetchURL' (loader session') mime uri
    return $ session' { currentURL = redirected, document = parseDocument resp }

maybeTimeout :: Session' -> URI -> IO Session' -> IO Session'
maybeTimeout session uri act | currentURL session /= uri, Just delay <- pageLoad $ timeouts session =


@@ 53,3 61,92 @@ next session = do
  where
    inner session'@Session { backStack = bs, currentURL = b, nextStack = n:ns } =
        (session' { backStack = b:bs, nextStack = ns }, n)

---

parseDocument (uri, "html/x-error\t", resp) = parseDocument (uri, "text/html", resp)
parseDocument (_, "text/html", Left text) = HTML.parseLT $ fromStrict text
parseDocument (_, "text/html", Right bytes) = HTML.parseLBS bytes
parseDocument (uri, mime, resp) | mime /= mime' = parseDocument (uri, mime', resp)
    where mime' = takeWhile (/= ';') mime
parseDocument (_, _, Left text)
    | Right doc <- XML.parseText XML.def $ fromStrict text = doc
    | otherwise = pageForText text
parseDocument (_, _, Right bytes) | Right doc <- XML.parseLBS XML.def bytes = doc
parseDocument (_, 't':'e':'x':'t':'/':_, Right bytes) = pageForText $ utf8' bytes -- charset wasn't specified, so assume utf-8.
parseDocument (_, mime, Right _) = pageForText $ pack ('(':mime ++ " binary data)")

pageForText txt = XML.Document {
        XML.documentPrologue = XML.Prologue [] Nothing [],
        XML.documentRoot = XML.Element {
            XML.elementName = "pre",
            XML.elementAttributes = M.empty,
            XML.elementNodes = [XML.NodeContent txt]
        },
        XML.documentEpilogue = []
    }

utf8' bytes = convertCharset "utf-8" $ B.toStrict bytes

--------
---- Gemini implementation
--------
-- Copied from css-syntax.
pattern (:.) :: Char -> Txt.Text -> Txt.Text
pattern x :. xs <- (Txt.uncons -> Just (x, xs))

infixr 5 :.

el name text = XML.Element name M.empty [XML.NodeContent text]

parseGemini :: Maybe String -> Txt.Text -> XML.Document
parseGemini lang txt = XML.Document {
        XML.documentPrologue = XML.Prologue [] Nothing [],
        XML.documentRoot = XML.Element {
            XML.elementName = "body",
            XML.elementAttributes = M.fromList [
                ("lang", Txt.pack lang') | Just langs <- [lang], lang' <- [csv langs]],
            XML.elementNodes = map XML.NodeElement $ parseGemini' $ Txt.lines txt
        },
        XML.documentEpilogue = []
    }

csv (',':_) = ""
csv (c:rest) = c:csv rest
csv "" = ""

parseGemini' :: [Txt.Text] -> [XML.Element]
parseGemini' (('#':.'#':.'#' :. '#':.'#':.'#':.line):lines) =
    el "h6" line : parseGemini' lines
parseGemini' (('#':.'#':.'#' :. '#':.'#':.line):lines) =
    el "h5" line : parseGemini' lines
parseGemini' (('#':.'#':.'#' :. '#':.line):lines) =
    el "h4" line : parseGemini' lines
parseGemini' (('#':.'#':.'#':.line):lines) = el "h3" line : parseGemini' lines
parseGemini' (('#':.'#':.line):lines) = el "h2" line : parseGemini' lines
parseGemini' (('#':.line):lines) = el "h1" line : parseGemini' lines
-- Not properly structured, but still sounds fine...
parseGemini' (('*':.line):lines) = el "li" line : parseGemini' lines
parseGemini' (('>':.line):lines) = el "blockquote" line : parseGemini' lines

parseGemini' (('=':.'>':.line):lines)
    | (url:text@(_:_)) <- Txt.words line = (el "a" $ Txt.unwords text) {
            XML.elementAttributes = M.insert "href" url M.empty
        } : parseGemini' lines
    | otherwise = (el "a" $ Txt.strip line) {
            XML.elementAttributes = M.insert "href" (Txt.strip line) M.empty
        } : parseGemini' lines
parseGemini' (('`':.'`':.'`':.line):lines) = el "p" line : go lines
    where
        go (('`':.'`':.'`':._):lines) = parseGemini' lines
        go (_:lines) = go lines
        go [] = []
parseGemini' ("```":lines) = go [] lines
    where
        go texts (('`':.'`':.'`':._):lines) =
            el "pre" (Txt.unlines texts) : parseGemini' lines
        go texts (line:lines) = go (texts ++ [line]) lines
        go texts [] = []

parseGemini' (line:lines) = el "p" line : parseGemini' lines
parseGemini' [] = []

M src/Main.hs => src/Main.hs +5 -1
@@ 19,6 19,7 @@ import Happstack.Server.I18N

import Internal
import Internal.Load as Load
import Internal.Elements as El
import Control.Monad.IO.Class (liftIO)
import Control.Monad (forM)



@@ 87,7 88,10 @@ sessionHome session = do
    method GET

    session' <- liftIO $ readMVar session
    Tpl.inspector ok "UUID" session' $ \langs -> H.h1 $ string $ show $ currentURL session'
    title <- liftIO $ El.getTitle session
    Tpl.inspector ok "title" session' $ \langs -> do
        H.h1 $ text title
        H.p $ string $ show $ currentURL session'

session404 uuid = do
    Tpl.page notFound ["404", "Amphiarao"] $ \langs -> do

M src/Webdriver.hs => src/Webdriver.hs +10 -0
@@ 22,6 22,7 @@ import Capabilities (processCaps)
import JSON
import qualified Internal as WD
import qualified Internal.Load as WD
import qualified Internal.Elements as WDE

serveWebdriver :: WD.Sessions -> ServerPart Response
serveWebdriver sessions = do


@@ 41,6 42,7 @@ serveSession sessions = WD.withSession fail (\uuid session -> msum [
        dir "refresh" $ reloadPage session,
        dir "back" $ sessionAction WD.back session,
        dir "forward" $ sessionAction WD.next session,
        dir "title" $ sessionTitle session,
        dir "window" $ msum [ -- Noops
            getWindowHandle uuid,
            delSession sessions uuid, -- Closing the only window closes the session.


@@ 130,6 132,12 @@ sessionAction cb session = do
    liftIO $ cb session
    ok $ toResponse ()

sessionTitle session = do
    method GET
    nullDir
    ret <- liftIO $ WDE.getTitle session
    ok $ toResponse ret

---- Windowing noops
getWindowHandle uuid = do
    method GET


@@ 161,3 169,5 @@ noSuchFrame = do
unsupportedOp = do
    nullDir
    errJSON 400 "unsupported operation" "Windowsize is meaningless to Rhapsode"

----