From f605efd1872c427894e67aa40893cad9ce8ce577 Mon Sep 17 00:00:00 2001 From: Adrian Cochrane Date: Wed, 14 Sep 2022 13:13:37 +1200 Subject: [PATCH] Release v0.1 decoupled from Rhapsode, with CSV/TSV & refined tablesorting! --- hurl-xml/hurl-xml.cabal | 6 +- hurl-xml/src/Network/MIME/XML.hs | 155 +++++++++++++++++-------- hurl-xml/src/Network/MIME/XML/Table.hs | 70 ++++++++++- 3 files changed, 175 insertions(+), 56 deletions(-) diff --git a/hurl-xml/hurl-xml.cabal b/hurl-xml/hurl-xml.cabal index ef44081..7e7ad3a 100644 --- a/hurl-xml/hurl-xml.cabal +++ b/hurl-xml/hurl-xml.cabal @@ -16,7 +16,7 @@ version: 0.1.0.0 synopsis: Fetch parsed XML & possibly CSS for a URL based on MIMEtype. -- A longer description of the package. -description: API bindings between HURL, XML-Conduit, & Haskell Stylist. +description: API bindings between HURL, XML-Conduit, & Haskell Stylist. Supports HTML, XML, Gemini, TSV, CSV, or plaintext files. Can automatically extract & apply CSS stylesheets if Haskell Stylist is used, in which case CSS files cause the previous page to get restyled. -- URL for the project homepage or repository. homepage: https://rhapsode.adrian.geek.nz/ @@ -63,8 +63,8 @@ library build-depends: base >=4.12 && <4.13, text, bytestring, containers, data-default-class, time, directory, filepath, temporary, - xml-conduit, html-conduit, - network-uri, hurl, file-embed, + xml-conduit >= 1.8 && <2, html-conduit >= 1.3 && <2, + network-uri, hurl >= 2.2 && <3, file-embed >= 0.0.15 && <0.1, css-syntax, stylist-traits, xml-conduit-stylist >= 3 && <4 -- Directories containing source files. diff --git a/hurl-xml/src/Network/MIME/XML.hs b/hurl-xml/src/Network/MIME/XML.hs index f260580..a87865b 100644 --- a/hurl-xml/src/Network/MIME/XML.hs +++ b/hurl-xml/src/Network/MIME/XML.hs @@ -50,12 +50,13 @@ data Page styles = Page { backStack :: [(String, URI)], forwardStack :: [(String, URI)], -- Probably don't need an MVar here, but let's be safe! - visitedURLs :: Set Text + visitedURLs :: Set Text, + appName :: String } -loadVisited :: IO (Set Text) -loadVisited = do - dir <- getXdgDirectory XdgData "rhapsode" +loadVisited :: String -> IO (Set Text) +loadVisited appname = do + dir <- getXdgDirectory XdgData appname let path = dir "history.gmni" exists <- doesFileExist path @@ -68,33 +69,35 @@ loadVisited = do readStrict path = do s <- Prelude.readFile path; length s `seq` return s utf8' bytes = convertCharset "utf-8" $ B.toStrict bytes +aCCEPT = ["text/xml", "application/xml", "text/html", "text/gemini", + "text/csv", "text/tab-separated-values", "text/css", "text/*", "*/*"] -fetchDocument http referer mime URI { uriScheme = "action:", uriPath = "nocache" } = - fetchDocument http { cachingEnabled = False } referer mime $ pageURL referer -fetchDocument http referer mime URI { uriScheme = "action:", uriPath = "novalidate" } = - fetchDocument http { validateCertificates = False } referer mime $ pageURL referer -fetchDocument http referer mime URI { uriScheme = "action:", uriPath = "history/back" } = - fetchURL' http mime (pageURL referer') >>= parseDocument' referer' http False +fetchDocument http referer URI { uriScheme = "action:", uriPath = "nocache" } = + fetchDocument http { cachingEnabled = False } referer $ pageURL referer +fetchDocument http referer URI { uriScheme = "action:", uriPath = "novalidate" } = + fetchDocument http { validateCertificates = False } referer $ pageURL referer +fetchDocument http referer URI { uriScheme = "action:", uriPath = "history/back" } = + fetchURL' http aCCEPT (pageURL referer') >>= parseDocument' referer' http False where referer' = shiftHistory referer (-1) -fetchDocument http referer mime URI { uriScheme = "action:", uriPath = "history/forward" } = - fetchURL' http mime (pageURL referer') >>= parseDocument' referer' http False +fetchDocument http referer URI { uriScheme = "action:", uriPath = "history/forward" } = + fetchURL' http aCCEPT (pageURL referer') >>= parseDocument' referer' http False where referer' = shiftHistory referer 1 -fetchDocument http referer mime URI { +fetchDocument http referer URI { uriScheme = "action:", uriPath = 'h':'i':'s':'t':'o':'r':'y':'/':x } | Just x' <- readMaybe x, referer' <- shiftHistory referer x' = - fetchURL' http mime (pageURL referer') >>= parseDocument' referer http False -fetchDocument http referer mime URI { uriScheme = "app:", uriPath = appID } = do + fetchURL' http aCCEPT (pageURL referer') >>= parseDocument' referer http False +fetchDocument http referer URI { uriScheme = "app:", uriPath = appID } = do dispatchByApp http Application { name = "", icon = nullURI, description = "", appId = appID } (pageMIME referer) $ pageURL referer return referer -- TODO play an error or success sound -fetchDocument http referer@Page { pageURL = uri0 } mime uri@URI { uriFragment = anchor } +fetchDocument http referer@Page { pageURL = uri0 } uri@URI { uriFragment = anchor } | uri { uriFragment = "" } == uri0 { uriFragment = "" } = return referer { html = applySortDoc anchor $ html referer, pageURL = uri } -fetchDocument http referer mime uri = fetchURL' http mime uri >>= parseDocument' referer http True +fetchDocument http referer uri = fetchURL' http aCCEPT uri >>= parseDocument' referer http True shiftHistory :: Page style -> Integer -> Page style shiftHistory self 0 = self @@ -129,26 +132,27 @@ parseDocument' ref@Page {visitedURLs = hist} sess saveHist resp@(URI {uriFragmen parseDocument :: StyleSheet s => Page s -> Session -> (URI, String, Either Text B.ByteString) -> IO (Page s) parseDocument ref sess (uri, "html/x-error\t", resp) = parseDocument ref sess (uri, "text/html", resp) -parseDocument Page {initCSS = css'} _ (uri, "text/html", Left text) = - pageForDoc css' uri $ HTML.parseLT $ fromStrict text -parseDocument Page {initCSS = css'} _(uri, "text/html", Right bytes) = - pageForDoc css' uri $ HTML.parseLBS bytes -parseDocument Page {initCSS = css'} _ +parseDocument Page {initCSS = css', appName = name} _ (uri, "text/html", Left text) = + pageForDoc css' name uri $ HTML.parseLT $ fromStrict text +parseDocument Page {initCSS = css', appName = name} _(uri, "text/html", Right bytes) = + pageForDoc css' name uri $ HTML.parseLBS bytes +parseDocument Page {initCSS = css', appName = name} _ (uri, 't':'e':'x':'t':'/':'g':'e':'m':'i':'n':'i':';':'l':'a':'n':'g':'=':lang, Left text) = - pageForDoc css' uri $ parseGemini (Just lang) text -parseDocument Page {initCSS = css'} _ + pageForDoc css' name uri $ parseGemini (Just lang) text +parseDocument Page {initCSS = css', appName = name} _ (uri, 't':'e':'x':'t':'/':'g':'e':'m':'i':'n':'i':';':'l':'a':'n':'g':'=':lang, Right bytes) = - pageForDoc css' uri $ parseGemini (Just lang) $ utf8' bytes -parseDocument Page {initCSS = css'} _ (uri, "text/gemini", Left text) = - pageForDoc css' uri $ parseGemini Nothing text -parseDocument Page {initCSS = css'} _ (uri, "text/gemini", Right bytes) = - pageForDoc css' uri $ parseGemini Nothing $ utf8' bytes + pageForDoc css' name uri $ parseGemini (Just lang) $ utf8' bytes +parseDocument Page {initCSS = css', appName = name} _ (uri, "text/gemini", Left text) = + pageForDoc css' name uri $ parseGemini Nothing text +parseDocument Page {initCSS = css', appName = name} _ (uri, "text/gemini", Right bytes) = + pageForDoc css' name uri $ parseGemini Nothing $ utf8' bytes parseDocument a b (a', b'@"text/css", Right bytes) = parseDocument a b (a', b', Left $ applyCSScharset (map Txt.unpack charsets) $ B.toStrict bytes) -parseDocument referer@Page {pageURL = uri', initCSS = css'} _ (uri, "text/css", Left text) +parseDocument referer@Page {pageURL = uri', initCSS = css', appName = name} _ + (uri, "text/css", Left text) | URI {uriAuthority = Just host} <- pageURL referer = do -- Save this per-domain setting - dir <- ( "domain") <$> getXdgDirectory XdgConfig "rhapsode" + dir <- ( "domain") <$> getXdgDirectory XdgConfig name createDirectoryIfMissing True dir Txt.writeFile (dir uriRegName host) $ CSSTok.serialize $ map absolutizeCSS $ CSSTok.tokenize text @@ -162,26 +166,35 @@ parseDocument referer@Page {pageURL = uri', initCSS = css'} _ (uri, "text/css", absolutizeCSS (CSSTok.Url text) | Just rel <- parseRelativeReference $ Txt.unpack text = CSSTok.Url $ Txt.pack $ uriToStr' $ relativeTo rel uri' absolutizeCSS tok = tok +parseDocument ref@Page {initCSS = css', appName = name} _ (uri, "text/csv", Left body) = + pageForDoc css' name uri $ parseDelimitedToTable ',' body +parseDocument ref@Page {initCSS = css', appName = name} _ (uri, "text/tab-separated-values", Left body) = + pageForDoc css' name uri $ parseDelimitedToTable '\t' body +parseDocument ref@Page {initCSS = css', appName = name} _ (uri, "text/csv", Right body) = + pageForDoc css' name uri $ parseDelimitedToTable ',' $ utf8' body +parseDocument ref@Page {initCSS = css', appName = name} _ + (uri, "text/tab-separated-values", Right body) = + pageForDoc css' name uri $ parseDelimitedToTable '\t' $ utf8' body parseDocument ref sess (uri, mime, body) | mime' /= mime = parseDocument ref sess (uri, mime', body) where mime' = takeWhile (/= ';') mime -parseDocument Page {initCSS = css'} _ (uri, _, Left text) - | Right doc <- XML.parseText def $ fromStrict text = pageForDoc css' uri doc - | otherwise = pageForText css' uri text -parseDocument Page {initCSS = css'} _ (uri, _, Right bytes) - | Right doc <- XML.parseLBS def bytes = pageForDoc css' uri doc -parseDocument Page {initCSS = css'} _ (uri, 't':'e':'x':'t':'/':_, Right bytes) = +parseDocument Page {initCSS = css', appName = name} _ (uri, _, Left text) + | Right doc <- XML.parseText def $ fromStrict text = pageForDoc css' name uri doc + | otherwise = pageForText css' name uri text +parseDocument Page {initCSS = css', appName = name} _ (uri, _, Right bytes) + | Right doc <- XML.parseLBS def bytes = pageForDoc css' name uri doc +parseDocument Page {initCSS = css', appName = name} _ (uri, 't':'e':'x':'t':'/':_, Right bytes) = -- charset wasn't specified, so assume utf-8. - pageForText css' uri $ utf8' bytes -parseDocument Page {initCSS = css'} sess resp@(uri, mime, _) = do + pageForText css' name uri $ utf8' bytes +parseDocument Page {initCSS = css', appName = name} sess resp@(uri, mime, _) = do dir <- getCurrentDirectory -- TODO find Downloads directory. ret <- saveDownload nullURI { uriScheme = "file:", uriAuthority = Just (URIAuth "" "" "") } dir resp >>= dispatchByMIME sess mime - pageForDoc css' uri $ HTML.parseLT $ LTxt.pack $ fromMaybe "Unsupported filetype" ret + pageForDoc css' name uri $ HTML.parseLT $ LTxt.pack $ fromMaybe "Unsupported filetype" ret -pageForText css' uri txt = pageForDoc css' uri XML.Document { +pageForText css' appname uri txt = pageForDoc css' appname uri XML.Document { XML.documentPrologue = XML.Prologue [] Nothing [], XML.documentRoot = XML.Element { XML.elementName = "pre", @@ -191,25 +204,26 @@ pageForText css' uri txt = pageForDoc css' uri XML.Document { XML.documentEpilogue = [] } -pageForDoc :: StyleSheet s => s -> URI -> Document -> IO (Page s) -pageForDoc css' uri doc = do +pageForDoc :: StyleSheet s => s -> String -> URI -> Document -> IO (Page s) +pageForDoc css' appname uri doc = do -- See if the user has configured an alternate stylesheet for this domain. let authorStyle = return $ html2css doc uri css' styles <- case uriAuthority uri of Nothing -> authorStyle Just host -> do - dir <- getXdgDirectory XdgConfig "rhapsode" + dir <- getXdgDirectory XdgConfig appname let path = dir "domain" uriRegName host hasAltStyle <- doesFileExist path if not hasAltStyle then authorStyle else parse css' <$> Txt.readFile path - return Page {pageURL = uri, html = doc, css = styles, initCSS = css', + return Page {pageURL = uri, html = doc, css = styles, + initCSS = css', appName = appname, -- These fields are all blank, to be filled in later by logHistory & parseDocument' pageTitle = "", pageMIME = "", apps = [], backStack = [], forwardStack = [], visitedURLs = Set.empty} -logHistory hist ret@Page {pageURL = url', html = doc} = do - dir <- getXdgDirectory XdgData "rhapsode" +logHistory hist ret@Page {pageURL = url', html = doc, appName = name} = do + dir <- getXdgDirectory XdgData name createDirectoryIfMissing True dir now <- getCurrentTime let title = Txt.unpack $ getTitle $ XML.documentRoot doc @@ -305,3 +319,50 @@ parseGemini' ("```":lines) = go [] lines parseGemini' (line:lines) = el "p" line : parseGemini' lines parseGemini' [] = [] + +-------- +---- TSV, CSV, etc +-------- + +parseDelimitedValues _ "" row rows = reverse (reverse row : rows) +parseDelimitedValues delim ('\r':.cs) row rows = parseDelimitedValues delim cs row rows +parseDelimitedValues delim ('\n':.cs) row rows = parseDelimitedValues delim cs [] (reverse row : rows) +parseDelimitedValues delim (c:.'"':.cs) row rows | c == delim = + let (value, cs') = inner cs in parseDelimitedValues delim cs' (value:row) rows + where + inner (x:.y:.cs) | x == delim && y == delim = let (a, b) = inner cs in (delim `Txt.cons` a, b) + inner (c:.cs) | c == delim = ("", cs) + | otherwise = let (a, b) = inner cs in (c `Txt.cons` a, b) + inner "" = ("", "") +parseDelimitedValues delim (c:.cs) row rows | c == delim = + let (value, cs') = Txt.break (`elem` ['\r', '\n', delim]) cs + in parseDelimitedValues delim cs' (value:row) rows +parseDelimitedValues delim cs row rows = + let (value, cs') = Txt.break (`elem` ['\r', '\n', delim]) cs + in parseDelimitedValues delim cs (value:row) rows + +escapeDelimitedValues delim source = map (map inner) $ parseDelimitedValues delim source [] [] + where + inner = Txt.strip . Txt.replace "\\\\" "\\" . Txt.replace "\\n" "\n" . + Txt.replace "\\t" "\t" . Txt.replace "\\r" "\r" + +parseDelimitedToTable delim source + | (head:body) <- filter (not . null) $ escapeDelimitedValues delim source = + XML.Document { + XML.documentPrologue = XML.Prologue [] Nothing [], + XML.documentRoot = XML.Element { + XML.elementName = "table", + XML.elementAttributes = M.empty, + XML.elementNodes = rowToTr "th" head : map (rowToTr "td") body + }, + XML.documentEpilogue = [] + } + | otherwise = XML.Document { -- Empty TSV/CSV/etc + XML.documentPrologue = XML.Prologue [] Nothing [], + XML.documentRoot = XML.Element "table" M.empty [], + XML.documentEpilogue = [] + } +rowToTr tagname values = XML.NodeElement $ XML.Element "tr" M.empty $ map inner values + where + inner = XML.NodeElement . XML.Element tagname M.empty . singleton . XML.NodeContent + singleton a = [a] diff --git a/hurl-xml/src/Network/MIME/XML/Table.hs b/hurl-xml/src/Network/MIME/XML/Table.hs index b3be792..36af47d 100644 --- a/hurl-xml/src/Network/MIME/XML/Table.hs +++ b/hurl-xml/src/Network/MIME/XML/Table.hs @@ -1,19 +1,24 @@ -{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE OverloadedStrings, PatternSynonyms, ViewPatterns #-} module Network.MIME.XML.Table(applySort, applySortDoc, splitTable) where import Text.XML -import Data.Text +import Data.Text as Txt import qualified Data.Map as M import Data.Maybe import qualified Data.List as L import Text.Read (readMaybe) +-- For smarter comparisons... +import Data.Time.Format (parseTimeM, defaultTimeLocale) +import Data.Time.Clock (UTCTime) +import Data.Char (isDigit) + applySortDoc :: String -> Document -> Document applySortDoc anchor doc@Document {documentRoot = el} = doc {documentRoot = applySort anchor el} applySort :: String -> Element -> Element -applySort ('#':'-':'r':'h':'a':'p':'s':'-':'%':anchor) el +applySort ('#':'-':'a':'r':'g':'o':'-':'%':anchor) el | (id', ord:col) <- L.break (`elem` ['<', '>']) anchor, Just col' <- readMaybe col = applySort' id' (ord == '<') col' el applySort _ el = el @@ -45,9 +50,10 @@ applySort'' asc col el | otherwise = el where compareRows (TableRow a _) (TableRow b _) - | asc = (a !! col) `compare` (b !! col) - | otherwise = (b !! col) `compare` (a !! col) + | asc = compareAs (a !! col) (b !! col) (comparators !! col) + | otherwise = compareAs (b !! col) (a !! col) (comparators !! col) (header, _, footer) = splitTable $ elementNodes el + comparators = tableHeadComparators header data TableRow = TableRow { keys :: [Text], markup :: [Element] } @@ -55,7 +61,7 @@ table2sorttable Element { elementName = Name "table" _ _, elementAttributes = attrs, elementNodes = childs - } | "-rhaps-unsortable" `elem` attrs, (_, body, _) <- splitTable childs = + } | "-argo-unsortable" `notElem` attrs, (_, body, _) <- splitTable childs = trs2sorttable body table2sorttable _ = Nothing @@ -83,6 +89,29 @@ splitTableBody els@(NodeElement _:_) = ([], els) splitTableBody (_:els) = splitTableBody els splitTableBody [] = ([], []) +tableHeadComparators :: [Node] -> [Text] +tableHeadComparators = Prelude.map (fromMaybe "alphanumeric") . tableHeadComparators' +tableHeadComparators' :: [Node] -> [Maybe Text] +tableHeadComparators' (NodeElement el@Element { elementName = Name name _ _, elementNodes = childs}:els) + | name == "thead" = tableHeadComparators' childs `mergeRight` tableHeadComparators' els + | name `elem` ["colgroup", "tr"] = tableRowComparators childs `mergeRight` tableHeadComparators' els + | otherwise = tableHeadComparators' els +tableHeadComparators' [] = [] +tableRowComparators :: [Node] -> [Maybe Text] +tableRowComparators (NodeElement el@(Element (Name "col" _ _) attrs _):els) = + let colspan = fromMaybe 1 (M.lookup "span" attrs >>= readMaybe . unpack) + in Prelude.replicate colspan (M.lookup "-argo-sortas" attrs) ++ tableRowComparators els +tableRowComparators (NodeElement el@(Element (Name n _ _) attrs _):els) | n `elem` ["td", "th"] = + let colspan = fromMaybe 1 (M.lookup "colspan" attrs >>= readMaybe . unpack) + in Prelude.replicate colspan (M.lookup "-argo-sortas" attrs) ++ tableRowComparators els +tableRowComparators (_:els) = tableRowComparators els +tableRowComparators [] = [] +mergeRight :: [Maybe a] -> [Maybe a] -> [Maybe a] +mergeRight (_:as) (Just b:bs) = Just b : mergeRight as bs +mergeRight (a:as) (_:bs) = a : mergeRight as bs +mergeRight [] bs = bs +mergeRight as [] = as + annotateTHead (NodeElement el@Element { elementName = Name "thead" _ _, elementNodes = childs }:nodes) a c = NodeElement el { elementNodes = annotateTHead childs a c } : nodes annotateTHead (NodeElement el@Element { elementName = Name "tr" _ _, elementNodes = childs }:nodes) a c = @@ -111,6 +140,8 @@ trs2sorttable _ = Nothing tds2keys :: [Element] -> Maybe [Text] tds2keys (el@Element {elementName = Name n _ _, elementAttributes = attrs, elementNodes = childs }:els) + | n `elem` ["td", "th"], Just key <- "-argo-sortkey" `M.lookup` attrs, Just rest <- tds2keys els = + Just (Prelude.replicate colspan key ++ rest) | n `elem` ["td", "th"], Just rest <- tds2keys els = Just (Prelude.replicate colspan (nodesText childs) ++ rest) where @@ -151,3 +182,30 @@ setAt i a ls go 0 (x:xs) = a x : xs go n (x:xs) = x : go (n-1) xs go _ [] = [] + +pattern (:.) :: Char -> Txt.Text -> Txt.Text +pattern x :. xs <- (Txt.uncons -> Just (x, xs)) + +infixr 5 :. + +compareAs :: Text -> Text -> Text -> Ordering +--- Hueristic that readily handles both numbers & text +compareAs (a:.as) (b:.bs) "alphanumeric" + | isDigit a && isDigit b = + let (a', as') = Txt.break (not . isDigit) as + (b', bs') = Txt.break (not . isDigit) bs + in if Txt.length a' == Txt.length b' && a == b + then compareAs as bs "alphanumeric" + else if Txt.length a' == Txt.length b' then a `compare` b + else Txt.length a' `compare` Txt.length b' + | a == b = compareAs as bs "alphanumeric" + | otherwise = a `compare` b +compareAs as bs "text" = as `compare` bs +compareAs as bs "number" = readInt as `compare` readInt bs + where + readInt :: Text -> Maybe Float + readInt = readMaybe . Prelude.filter (`elem` '-':'.':['0'..'9']) . unpack +compareAs as bs fmt = readTime as `compare` readTime bs + where + readTime :: Text -> Maybe UTCTime + readTime = parseTimeM True defaultTimeLocale (unpack fmt) . unpack -- 2.30.2