~alcinnz/hurl

f605efd1872c427894e67aa40893cad9ce8ce577 — Adrian Cochrane 2 years ago 5321296
Release v0.1 decoupled from Rhapsode, with CSV/TSV & refined tablesorting!
3 files changed, 175 insertions(+), 56 deletions(-)

M hurl-xml/hurl-xml.cabal
M hurl-xml/src/Network/MIME/XML.hs
M hurl-xml/src/Network/MIME/XML/Table.hs
M hurl-xml/hurl-xml.cabal => hurl-xml/hurl-xml.cabal +3 -3
@@ 16,7 16,7 @@ version:             0.1.0.0
synopsis:            Fetch parsed XML & possibly CSS for a URL based on MIMEtype.

-- A longer description of the package.
description:         API bindings between HURL, XML-Conduit, & Haskell Stylist.
description:         API bindings between HURL, XML-Conduit, & Haskell Stylist. Supports HTML, XML, Gemini, TSV, CSV, or plaintext files. Can automatically extract & apply CSS stylesheets if Haskell Stylist is used, in which case CSS files cause the previous page to get restyled.

-- URL for the project homepage or repository.
homepage:            https://rhapsode.adrian.geek.nz/


@@ 63,8 63,8 @@ library
  build-depends:       base >=4.12 && <4.13, text, bytestring, containers,
                       data-default-class,
                       time, directory, filepath, temporary,
                       xml-conduit, html-conduit,
                       network-uri, hurl, file-embed,
                       xml-conduit >= 1.8 && <2, html-conduit >= 1.3 && <2,
                       network-uri, hurl >= 2.2 && <3, file-embed >= 0.0.15 && <0.1,
                       css-syntax, stylist-traits, xml-conduit-stylist >= 3 && <4

  -- Directories containing source files.

M hurl-xml/src/Network/MIME/XML.hs => hurl-xml/src/Network/MIME/XML.hs +108 -47
@@ 50,12 50,13 @@ data Page styles = Page {
    backStack :: [(String, URI)],
    forwardStack :: [(String, URI)],
    -- Probably don't need an MVar here, but let's be safe!
    visitedURLs :: Set Text
    visitedURLs :: Set Text,
    appName :: String
}

loadVisited :: IO (Set Text)
loadVisited = do
    dir <- getXdgDirectory XdgData "rhapsode"
loadVisited :: String -> IO (Set Text)
loadVisited appname = do
    dir <- getXdgDirectory XdgData appname
    let path = dir </> "history.gmni"
    exists <- doesFileExist path



@@ 68,33 69,35 @@ loadVisited = do
readStrict path = do s <- Prelude.readFile path; length s `seq` return s

utf8' bytes = convertCharset "utf-8" $ B.toStrict bytes
aCCEPT = ["text/xml", "application/xml", "text/html", "text/gemini",
    "text/csv", "text/tab-separated-values", "text/css", "text/*", "*/*"]

fetchDocument http referer mime URI { uriScheme = "action:", uriPath = "nocache" } =
    fetchDocument http { cachingEnabled = False } referer mime $ pageURL referer
fetchDocument http referer mime URI { uriScheme = "action:", uriPath = "novalidate" } =
    fetchDocument http { validateCertificates = False } referer mime $ pageURL referer
fetchDocument http referer mime URI { uriScheme = "action:", uriPath = "history/back" } =
        fetchURL' http mime (pageURL referer') >>= parseDocument' referer' http False
fetchDocument http referer URI { uriScheme = "action:", uriPath = "nocache" } =
    fetchDocument http { cachingEnabled = False } referer $ pageURL referer
fetchDocument http referer URI { uriScheme = "action:", uriPath = "novalidate" } =
    fetchDocument http { validateCertificates = False } referer $ pageURL referer
fetchDocument http referer URI { uriScheme = "action:", uriPath = "history/back" } =
        fetchURL' http aCCEPT (pageURL referer') >>= parseDocument' referer' http False
    where referer' = shiftHistory referer (-1)
fetchDocument http referer mime URI { uriScheme = "action:", uriPath = "history/forward" } =
        fetchURL' http mime (pageURL referer') >>= parseDocument' referer' http False
fetchDocument http referer URI { uriScheme = "action:", uriPath = "history/forward" } =
        fetchURL' http aCCEPT (pageURL referer') >>= parseDocument' referer' http False
    where referer' = shiftHistory referer 1
fetchDocument http referer mime URI {
fetchDocument http referer URI {
        uriScheme = "action:", uriPath = 'h':'i':'s':'t':'o':'r':'y':'/':x
    } | Just x' <- readMaybe x, referer' <- shiftHistory referer x' =
        fetchURL' http mime (pageURL referer') >>= parseDocument' referer http False
fetchDocument http referer mime URI { uriScheme = "app:", uriPath = appID } = do
        fetchURL' http aCCEPT (pageURL referer') >>= parseDocument' referer http False
fetchDocument http referer URI { uriScheme = "app:", uriPath = appID } = do
    dispatchByApp http Application {
        name = "", icon = nullURI, description = "",
        appId = appID
      } (pageMIME referer) $ pageURL referer
    return referer -- TODO play an error or success sound
fetchDocument http referer@Page { pageURL = uri0 } mime uri@URI { uriFragment = anchor }
fetchDocument http referer@Page { pageURL = uri0 } uri@URI { uriFragment = anchor }
    | uri { uriFragment = "" } == uri0 { uriFragment = "" } = return referer {
        html = applySortDoc anchor $ html referer,
        pageURL = uri
    }
fetchDocument http referer mime uri = fetchURL' http mime uri >>= parseDocument' referer http True
fetchDocument http referer uri = fetchURL' http aCCEPT uri >>= parseDocument' referer http True

shiftHistory :: Page style -> Integer -> Page style
shiftHistory self 0 = self


@@ 129,26 132,27 @@ parseDocument' ref@Page {visitedURLs = hist} sess saveHist resp@(URI {uriFragmen
parseDocument :: StyleSheet s => Page s -> Session -> (URI, String, Either Text B.ByteString)
        -> IO (Page s)
parseDocument ref sess (uri, "html/x-error\t", resp) = parseDocument ref sess (uri, "text/html", resp)
parseDocument Page {initCSS = css'} _ (uri, "text/html", Left text) =
    pageForDoc css' uri $ HTML.parseLT $ fromStrict text
parseDocument Page {initCSS = css'} _(uri, "text/html", Right bytes) =
    pageForDoc css' uri $ HTML.parseLBS bytes
parseDocument Page {initCSS = css'} _
parseDocument Page {initCSS = css', appName = name} _ (uri, "text/html", Left text) =
    pageForDoc css' name uri $ HTML.parseLT $ fromStrict text
parseDocument Page {initCSS = css', appName = name} _(uri, "text/html", Right bytes) =
    pageForDoc css' name uri $ HTML.parseLBS bytes
parseDocument Page {initCSS = css', appName = name} _
        (uri, 't':'e':'x':'t':'/':'g':'e':'m':'i':'n':'i':';':'l':'a':'n':'g':'=':lang, Left text) =
    pageForDoc css' uri $ parseGemini (Just lang) text
parseDocument Page {initCSS = css'} _
    pageForDoc css' name uri $ parseGemini (Just lang) text
parseDocument Page {initCSS = css', appName = name} _
        (uri, 't':'e':'x':'t':'/':'g':'e':'m':'i':'n':'i':';':'l':'a':'n':'g':'=':lang, Right bytes) =
    pageForDoc css' uri $ parseGemini (Just lang) $ utf8' bytes
parseDocument Page {initCSS = css'} _ (uri, "text/gemini", Left text) =
    pageForDoc css' uri $ parseGemini Nothing text
parseDocument Page {initCSS = css'} _ (uri, "text/gemini", Right bytes) =
    pageForDoc css' uri $ parseGemini Nothing $ utf8' bytes
    pageForDoc css' name uri $ parseGemini (Just lang) $ utf8' bytes
parseDocument Page {initCSS = css', appName = name} _ (uri, "text/gemini", Left text) =
    pageForDoc css' name uri $ parseGemini Nothing text
parseDocument Page {initCSS = css', appName = name} _ (uri, "text/gemini", Right bytes) =
    pageForDoc css' name uri $ parseGemini Nothing $ utf8' bytes
parseDocument a b (a', b'@"text/css", Right bytes) =
    parseDocument a b (a', b', Left $ applyCSScharset (map Txt.unpack charsets) $ B.toStrict bytes)
parseDocument referer@Page {pageURL = uri', initCSS = css'} _ (uri, "text/css", Left text)
parseDocument referer@Page {pageURL = uri', initCSS = css', appName = name} _
    (uri, "text/css", Left text)
  | URI {uriAuthority = Just host} <- pageURL referer = do
    -- Save this per-domain setting
    dir <- (</> "domain") <$> getXdgDirectory XdgConfig "rhapsode"
    dir <- (</> "domain") <$> getXdgDirectory XdgConfig name
    createDirectoryIfMissing True dir
    Txt.writeFile (dir </> uriRegName host) $
        CSSTok.serialize $ map absolutizeCSS $ CSSTok.tokenize text


@@ 162,26 166,35 @@ parseDocument referer@Page {pageURL = uri', initCSS = css'} _ (uri, "text/css", 
  absolutizeCSS (CSSTok.Url text) | Just rel <- parseRelativeReference $ Txt.unpack text =
    CSSTok.Url $ Txt.pack $ uriToStr' $ relativeTo rel uri'
  absolutizeCSS tok = tok
parseDocument ref@Page {initCSS = css', appName = name} _ (uri, "text/csv", Left body) =
    pageForDoc css' name uri $ parseDelimitedToTable ',' body
parseDocument ref@Page {initCSS = css', appName = name} _ (uri, "text/tab-separated-values", Left body) =
    pageForDoc css' name uri $ parseDelimitedToTable '\t' body
parseDocument ref@Page {initCSS = css', appName = name} _ (uri, "text/csv", Right body) =
    pageForDoc css' name uri $ parseDelimitedToTable ',' $ utf8' body
parseDocument ref@Page {initCSS = css', appName = name} _
        (uri, "text/tab-separated-values", Right body) =
    pageForDoc css' name uri $ parseDelimitedToTable '\t' $ utf8' body

parseDocument ref sess (uri, mime, body) | mime' /= mime = parseDocument ref sess (uri, mime', body)
    where mime' = takeWhile (/= ';') mime
parseDocument Page {initCSS = css'} _ (uri, _, Left text)
    | Right doc <- XML.parseText def $ fromStrict text = pageForDoc css' uri doc
    | otherwise = pageForText css' uri text
parseDocument Page {initCSS = css'} _ (uri, _, Right bytes)
    | Right doc <- XML.parseLBS def bytes = pageForDoc css' uri doc
parseDocument Page {initCSS = css'} _ (uri, 't':'e':'x':'t':'/':_, Right bytes) =
parseDocument Page {initCSS = css', appName = name} _ (uri, _, Left text)
    | Right doc <- XML.parseText def $ fromStrict text = pageForDoc css' name uri doc
    | otherwise = pageForText css' name uri text
parseDocument Page {initCSS = css', appName = name} _ (uri, _, Right bytes)
    | Right doc <- XML.parseLBS def bytes = pageForDoc css' name uri doc
parseDocument Page {initCSS = css', appName = name} _ (uri, 't':'e':'x':'t':'/':_, Right bytes) =
    -- charset wasn't specified, so assume utf-8.
    pageForText css' uri $ utf8' bytes
parseDocument Page {initCSS = css'} sess resp@(uri, mime, _) = do
    pageForText css' name uri $ utf8' bytes
parseDocument Page {initCSS = css', appName = name} sess resp@(uri, mime, _) = do
    dir <- getCurrentDirectory -- TODO find Downloads directory.
    ret <- saveDownload nullURI {
        uriScheme = "file:",
        uriAuthority = Just (URIAuth "" "" "")
    } dir resp >>= dispatchByMIME sess mime
    pageForDoc css' uri $ HTML.parseLT $ LTxt.pack $ fromMaybe "Unsupported filetype" ret
    pageForDoc css' name uri $ HTML.parseLT $ LTxt.pack $ fromMaybe "Unsupported filetype" ret

pageForText css' uri txt = pageForDoc css' uri XML.Document {
pageForText css' appname uri txt = pageForDoc css' appname uri XML.Document {
        XML.documentPrologue = XML.Prologue [] Nothing [],
        XML.documentRoot = XML.Element {
            XML.elementName = "pre",


@@ 191,25 204,26 @@ pageForText css' uri txt = pageForDoc css' uri XML.Document {
        XML.documentEpilogue = []
    }

pageForDoc :: StyleSheet s => s -> URI -> Document -> IO (Page s)
pageForDoc css' uri doc = do
pageForDoc :: StyleSheet s => s -> String -> URI -> Document -> IO (Page s)
pageForDoc css' appname uri doc = do
    -- See if the user has configured an alternate stylesheet for this domain.
    let authorStyle = return $ html2css doc uri css'
    styles <- case uriAuthority uri of
        Nothing -> authorStyle
        Just host -> do
            dir <- getXdgDirectory XdgConfig "rhapsode"
            dir <- getXdgDirectory XdgConfig appname
            let path = dir </> "domain" </> uriRegName host
            hasAltStyle <- doesFileExist path
            if not hasAltStyle then authorStyle else parse css' <$> Txt.readFile path

    return Page {pageURL = uri, html = doc, css = styles, initCSS = css',
    return Page {pageURL = uri, html = doc, css = styles,
        initCSS = css', appName = appname,
        -- These fields are all blank, to be filled in later by logHistory & parseDocument'
        pageTitle = "", pageMIME = "", apps = [],
        backStack = [], forwardStack = [], visitedURLs = Set.empty}

logHistory hist ret@Page {pageURL = url', html = doc} = do
    dir <- getXdgDirectory XdgData "rhapsode"
logHistory hist ret@Page {pageURL = url', html = doc, appName = name} = do
    dir <- getXdgDirectory XdgData name
    createDirectoryIfMissing True dir
    now <- getCurrentTime
    let title = Txt.unpack $ getTitle $ XML.documentRoot doc


@@ 305,3 319,50 @@ parseGemini' ("```":lines) = go [] lines

parseGemini' (line:lines) = el "p" line : parseGemini' lines
parseGemini' [] = []

--------
---- TSV, CSV, etc
--------

parseDelimitedValues _ "" row rows = reverse (reverse row : rows)
parseDelimitedValues delim ('\r':.cs) row rows = parseDelimitedValues delim cs row rows
parseDelimitedValues delim ('\n':.cs) row rows = parseDelimitedValues delim cs [] (reverse row : rows)
parseDelimitedValues delim (c:.'"':.cs) row rows | c == delim =
        let (value, cs') = inner cs in parseDelimitedValues delim cs' (value:row) rows
    where
        inner (x:.y:.cs) | x == delim && y == delim = let (a, b) = inner cs in (delim `Txt.cons` a, b)
        inner (c:.cs) | c == delim = ("", cs)
            | otherwise = let (a, b) = inner cs in (c `Txt.cons` a, b)
        inner "" = ("", "")
parseDelimitedValues delim (c:.cs) row rows | c == delim =
    let (value, cs') = Txt.break (`elem` ['\r', '\n', delim]) cs
    in parseDelimitedValues delim cs' (value:row) rows
parseDelimitedValues delim cs row rows =
    let (value, cs') = Txt.break (`elem` ['\r', '\n', delim]) cs
    in parseDelimitedValues delim cs (value:row) rows

escapeDelimitedValues delim source = map (map inner) $ parseDelimitedValues delim source [] []
    where
        inner = Txt.strip . Txt.replace "\\\\" "\\" . Txt.replace "\\n" "\n" .
            Txt.replace "\\t" "\t" . Txt.replace "\\r" "\r"

parseDelimitedToTable delim source
    | (head:body) <- filter (not . null) $ escapeDelimitedValues delim source =
        XML.Document {
            XML.documentPrologue = XML.Prologue [] Nothing [],
            XML.documentRoot = XML.Element {
                XML.elementName = "table",
                XML.elementAttributes = M.empty,
                XML.elementNodes = rowToTr "th" head : map (rowToTr "td") body
            },
            XML.documentEpilogue = []
        }
    | otherwise = XML.Document { -- Empty TSV/CSV/etc
        XML.documentPrologue = XML.Prologue [] Nothing [],
        XML.documentRoot = XML.Element "table" M.empty [],
        XML.documentEpilogue = []
    }
rowToTr tagname values = XML.NodeElement $ XML.Element "tr" M.empty $ map inner values
    where
        inner = XML.NodeElement . XML.Element tagname M.empty . singleton . XML.NodeContent
        singleton a = [a]

M hurl-xml/src/Network/MIME/XML/Table.hs => hurl-xml/src/Network/MIME/XML/Table.hs +64 -6
@@ 1,19 1,24 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedStrings, PatternSynonyms, ViewPatterns #-}
module Network.MIME.XML.Table(applySort, applySortDoc, splitTable) where

import Text.XML
import Data.Text
import Data.Text as Txt
import qualified Data.Map as M

import Data.Maybe
import qualified Data.List as L
import Text.Read (readMaybe)

-- For smarter comparisons...
import Data.Time.Format (parseTimeM, defaultTimeLocale)
import Data.Time.Clock (UTCTime)
import Data.Char (isDigit)

applySortDoc :: String -> Document -> Document
applySortDoc anchor doc@Document {documentRoot = el} = doc {documentRoot = applySort anchor el}

applySort :: String -> Element -> Element
applySort ('#':'-':'r':'h':'a':'p':'s':'-':'%':anchor) el
applySort ('#':'-':'a':'r':'g':'o':'-':'%':anchor) el
    | (id', ord:col) <- L.break (`elem` ['<', '>']) anchor, Just col' <- readMaybe col =
        applySort' id' (ord == '<') col' el
applySort _ el = el


@@ 45,9 50,10 @@ applySort'' asc col el
    | otherwise = el
  where
    compareRows (TableRow a _) (TableRow b _)
        | asc = (a !! col) `compare` (b !! col)
        | otherwise = (b !! col) `compare` (a !! col)
        | asc = compareAs (a !! col) (b !! col) (comparators !! col)
        | otherwise = compareAs (b !! col) (a !! col) (comparators !! col)
    (header, _, footer) = splitTable $ elementNodes el
    comparators = tableHeadComparators header

data TableRow = TableRow { keys :: [Text], markup :: [Element] }



@@ 55,7 61,7 @@ table2sorttable Element {
        elementName = Name "table" _ _,
        elementAttributes = attrs,
        elementNodes = childs
    } | "-rhaps-unsortable" `elem` attrs, (_, body, _) <- splitTable childs =
    } | "-argo-unsortable" `notElem` attrs, (_, body, _) <- splitTable childs =
        trs2sorttable body
table2sorttable _ = Nothing



@@ 83,6 89,29 @@ splitTableBody els@(NodeElement _:_) = ([], els)
splitTableBody (_:els) = splitTableBody els
splitTableBody [] = ([], [])

tableHeadComparators :: [Node] -> [Text]
tableHeadComparators = Prelude.map (fromMaybe "alphanumeric") . tableHeadComparators'
tableHeadComparators' :: [Node] -> [Maybe Text]
tableHeadComparators' (NodeElement el@Element { elementName = Name name _ _, elementNodes = childs}:els)
    | name == "thead" = tableHeadComparators' childs `mergeRight` tableHeadComparators' els
    | name `elem` ["colgroup", "tr"] = tableRowComparators childs `mergeRight` tableHeadComparators' els
    | otherwise = tableHeadComparators' els
tableHeadComparators' [] = []
tableRowComparators :: [Node] -> [Maybe Text]
tableRowComparators (NodeElement el@(Element (Name "col" _ _) attrs _):els) =
    let colspan = fromMaybe 1 (M.lookup "span" attrs >>= readMaybe . unpack)
    in Prelude.replicate colspan (M.lookup "-argo-sortas" attrs) ++ tableRowComparators els
tableRowComparators (NodeElement el@(Element (Name n _ _) attrs _):els) | n `elem` ["td", "th"] =
    let colspan = fromMaybe 1 (M.lookup "colspan" attrs >>= readMaybe . unpack)
    in Prelude.replicate colspan (M.lookup "-argo-sortas" attrs) ++ tableRowComparators els
tableRowComparators (_:els) = tableRowComparators els
tableRowComparators [] = []
mergeRight :: [Maybe a] -> [Maybe a] -> [Maybe a]
mergeRight (_:as) (Just b:bs) = Just b : mergeRight as bs
mergeRight (a:as) (_:bs) = a : mergeRight as bs
mergeRight [] bs = bs
mergeRight as [] = as

annotateTHead (NodeElement el@Element { elementName = Name "thead" _ _, elementNodes = childs }:nodes) a c =
    NodeElement el { elementNodes = annotateTHead childs a c } : nodes
annotateTHead (NodeElement el@Element { elementName = Name "tr" _ _, elementNodes = childs }:nodes) a c =


@@ 111,6 140,8 @@ trs2sorttable _ = Nothing

tds2keys :: [Element] -> Maybe [Text]
tds2keys (el@Element {elementName = Name n _ _, elementAttributes = attrs, elementNodes = childs }:els)
    | n `elem` ["td", "th"], Just key <- "-argo-sortkey" `M.lookup` attrs, Just rest <- tds2keys els =
        Just (Prelude.replicate colspan key ++ rest)
    | n `elem` ["td", "th"], Just rest <- tds2keys els =
        Just (Prelude.replicate colspan (nodesText childs) ++ rest)
  where


@@ 151,3 182,30 @@ setAt i a ls
    go 0 (x:xs) = a x : xs
    go n (x:xs) = x : go (n-1) xs
    go _ []     = []

pattern (:.) :: Char -> Txt.Text -> Txt.Text
pattern x :. xs <- (Txt.uncons -> Just (x, xs))

infixr 5 :.

compareAs :: Text -> Text -> Text -> Ordering
--- Hueristic that readily handles both numbers & text
compareAs (a:.as) (b:.bs) "alphanumeric"
    | isDigit a && isDigit b =
        let (a', as') = Txt.break (not . isDigit) as
            (b', bs') = Txt.break (not . isDigit) bs
        in if Txt.length a' == Txt.length b' && a == b
        then compareAs as bs "alphanumeric"
        else if Txt.length a' == Txt.length b' then a `compare` b
        else Txt.length a' `compare` Txt.length b'
    | a == b = compareAs as bs "alphanumeric"
    | otherwise = a `compare` b
compareAs as bs "text" = as `compare` bs
compareAs as bs "number" = readInt as `compare` readInt bs
    where
        readInt :: Text -> Maybe Float
        readInt = readMaybe . Prelude.filter (`elem` '-':'.':['0'..'9']) . unpack
compareAs as bs fmt = readTime as `compare` readTime bs
    where
        readTime :: Text -> Maybe UTCTime
        readTime = parseTimeM True defaultTimeLocale (unpack fmt) . unpack