From c900dfa44910044796d0fa99bc987d4d7c2dbd4e Mon Sep 17 00:00:00 2001 From: Adrian Cochrane Date: Sat, 11 Jan 2020 20:11:17 +1300 Subject: [PATCH] Add new URL & commandline arguments input interface. --- rhapsode.cabal | 9 ++-- src/Input.hs | 128 +++++++++++++++++++++++++++++++++++++++++++++++++ src/Main.hs | 41 ++++++++++------ 3 files changed, 160 insertions(+), 18 deletions(-) create mode 100644 src/Input.hs diff --git a/rhapsode.cabal b/rhapsode.cabal index da09dcd..a1000f6 100644 --- a/rhapsode.cabal +++ b/rhapsode.cabal @@ -60,11 +60,12 @@ executable rhapsode -- other-extensions: -- Other library packages from which modules are imported. - build-depends: base >=4.9 && <4.10, directory, - http-client, http-client-tls, bytestring, - html-conduit, xml-conduit, text, containers, + build-depends: base >=4.9 && <=4.12, directory, + http-client, http-client-tls, bytestring, http-types, + html-conduit, xml-conduit, text, containers, data-default-class, network-uri, - stylist >= 1.1, css-syntax, xml-conduit-stylist, scientific + stylist >= 1.1, css-syntax, xml-conduit-stylist, scientific, + async -- Directories containing source files. hs-source-dirs: src diff --git a/src/Input.hs b/src/Input.hs new file mode 100644 index 0000000..e4142a5 --- /dev/null +++ b/src/Input.hs @@ -0,0 +1,128 @@ +{-# LANGUAGE OverloadedStrings #-} +module Input(parseArgs, ProgramCtl(..), fetchURL, convertCharset, charsets) where + +import qualified Network.HTTP.Client as HTTP +import Network.HTTP.Types +import qualified Data.Text as Txt +import Data.Text.Lazy (fromStrict) +import Data.Text (Text) +import Data.Text.Encoding +import qualified Data.Text.IO as TxtIO +import qualified Data.ByteString.Lazy as B +import Data.ByteString.Lazy (ByteString) +import qualified Text.HTML.DOM as HTML +import qualified Text.XML as XML +import Network.URI +import qualified Data.Map as M + +import Control.Concurrent.Async + +import System.IO +import System.Environment +import System.Directory +import Data.List +import Data.Default.Class + +--- Commandline arguments +data ProgramCtl = ProgramCtl { + docs :: [(URI, XML.Document)], + outSSML :: Handle +} +parseArgs :: HTTP.Manager -> IO ProgramCtl +parseArgs http = do + args <- getArgs + let (inputs, outputs) = partition (isPrefixOf "-" . fst) $ preparseArgs args + cwd <- getCurrentDirectory + let base = URI {uriScheme = "file:", uriPath = cwd, + uriAuthority = Nothing, uriQuery = "", uriFragment = ""} + let inputs' = [(f, relativeTo uri base) | (f, Just uri) <- inputs] + docs <- forConcurrently inputs' $ evalInput http + outSSML <- parseSSMLout outputs + return $ ProgramCtl { + docs = [(uri, doc) | ((_, uri), doc) <- zip inputs' docs], + outSSML = outSSML + } + +preparseArgs (flag@('-':_):val:args) | Just url <- parseURIReference val = + (flag, Just url) : preparseArgs args +preparseArgs (flag@('+':_):args@(('+':_):_)) = (flag, Nothing) : preparseArgs args +preparseArgs (flag@('+':_):args@(('-':_):_)) = (flag, Nothing) : preparseArgs args +preparseArgs (flag@('+':_):val:args) = (flag, parseURIReference val) : preparseArgs args +preparseArgs (val:args) | Just url <- parseURIReference val = + ("-h", Just url) : preparseArgs args +preparseArgs [] = [] +preparseArgs (arg:_) = error ("Unsupported argument " ++ arg) + +parseSSMLout (("+s", Just uri):_) = openFile (uriPath uri) ReadMode +parseSSMLout (("+ssml", Just uri):_) = openFile (uriPath uri) ReadMode +parseSSMLout (("+s", Nothing):_) = return stdout +parseSSMLout (("+ssml", Nothing):_) = return stdout +parseSSMLout (_:args) = parseSSMLout args +parseSSMLout [] = return stdout + +evalInput http ("-h", url) = fetchDocument http "text/html" url +evalInput http ("-html", url) = fetchDocument http "text/html" url +evalInput http ("-x", url) = fetchDocument http "text/xml" url +evalInput http ("-xml", url) = fetchDocument http "text/xml" url +evalInput _ (flag, _) = error ("Unsupported input flag " ++ flag) + +fetchDocument http mime uri = fetchURL http mime uri >>= parseDocument +parseDocument ("text/html", Left text) = return $ HTML.parseLT $ fromStrict text +parseDocument ("text/html", Right bytes) = return $ HTML.parseLBS bytes +parseDocument ("text/plain", Left text) = return $ docForText text +parseDocument ("text/plain", Right bytes) = return $ docForText $ decodeUtf8 $ B.toStrict bytes +parseDocument (_, Left text) | Right doc <- XML.parseText def $ fromStrict text = return doc +parseDocument (_, Right bytes) | Right doc <- XML.parseLBS def bytes = return doc +parseDocument (mime, _) = return $ docForText $ Txt.concat ["Unsupported MIMEtype ", Txt.pack mime] + +docForText txt = XML.Document { + XML.documentPrologue = XML.Prologue [] Nothing [], + XML.documentRoot = XML.Element { + XML.elementName = "pre", + XML.elementAttributes = M.empty, + XML.elementNodes = [XML.NodeContent txt] + }, + XML.documentEpilogue = [] + } + +-------- + +fetchURL :: HTTP.Manager -> String -> URI -> IO (String, Either Text ByteString) +fetchURL http defaultMIME uri | uriScheme uri `elem` ["http:", "https:"] = do + request <- HTTP.requestFromURI uri + response <- HTTP.httpLbs request http + return $ case ( + HTTP.responseBody response, + [val | ("content-type", val) <- HTTP.responseHeaders response] + ) of + ("", _) -> ("text/plain", Right $ B.fromStrict $ statusMessage $ HTTP.responseStatus response) + (response, (mimetype:_)) -> let mime = Txt.toLower $ decodeUtf8 mimetype + in resolveCharset (map (Txt.unpack . Txt.strip) $ Txt.splitOn ";" $ mime) response + (response, []) -> (defaultMIME, Right response) + +fetchURL _ defaultMIME uri@URI {uriScheme = "file:"} = do + response <- B.readFile $ uriPath uri + return (defaultMIME, Right response) + +fetchURL _ _ uri = return ("text/plain", Left $ Txt.concat ["Unsupported link type ", Txt.pack $ uriScheme uri]) + +resolveCharset :: [String] -> ByteString -> (String, Either Text ByteString) +resolveCharset (mime:('c':'h':'a':'r':'s':'e':'t':'=':charset):_) response = + (mime, Left $ convertCharset charset $ B.toStrict response) +resolveCharset (mime:_:params) response = resolveCharset (mime:params) response +resolveCharset [mime] response = (mime, Right $ response) +resolveCharset [] response = ("text/plain", Left "Filetype unspecified") + +convertCharset "iso-8859-1" = decodeLatin1 +convertCharset "latin1" = decodeLatin1 +convertCharset "us-ascii" = decodeUtf8 +convertCharset "utf-8" = decodeUtf8 +convertCharset "utf-16be" = decodeUtf16BE +convertCharset "utf-16le" = decodeUtf16LE +convertCharset "utf-16" = decodeUtf16LE +convertCharset "utf-32be" = decodeUtf32BE +convertCharset "utf-32le" = decodeUtf32LE +convertCharset "utf-32" = decodeUtf32LE +convertCharset _ = \_ -> "Unsupported text encoding!" +charsets :: [Text] +charsets = ["iso-8859-1", "latin1", "us-ascii", "utf-8", "utf-16be", "utf-16le", "utf-16", "utf-32be", "utf-32le", "utf-32"] diff --git a/src/Main.hs b/src/Main.hs index 477ed3c..dc0e6b5 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -9,6 +9,7 @@ import Network.HTTP.Client.Internal import qualified Network.HTTP.Client.TLS as TLS import Network.URI import qualified Data.ByteString.Lazy.Char8 as C8 +import qualified Data.ByteString.Lazy as B import qualified Text.HTML.DOM as HTML import qualified Text.XML as XML @@ -26,23 +27,24 @@ import qualified Data.Map as M import Data.Scientific (toRealFloat) import Data.Maybe (fromJust, fromMaybe) import System.Directory as Dir +import Control.Monad import DefaultCSS import StyleTree import SSML +import Input +import Links main :: IO () main = do - url:_ <- getArgs - -- TODO support more URI schemes, and do nonblocking networking. This could be it's own module. - request <- HTTP.parseRequest url - manager <- HTTP.newManager TLS.tlsManagerSettings - response <- HTTP.httpLbs request manager - let html = XML.documentRoot $ HTML.parseLBS $ HTTP.responseBody response - let aboutBlank = fromJust $ parseURI "about:blank" - style <- retreiveStyles (fromMaybe aboutBlank $ parseURI url) html manager $ fromJust $ parseURI url - let transcript = stylize style html - C8.putStrLn $ renderElLBS $ styleToSSML $ applyCounters transcript + http <- HTTP.newManager TLS.tlsManagerSettings + ProgramCtl docs outSSML <- parseArgs http + forM docs $ \(uri, doc) -> do + let html = XML.documentRoot doc + style <- retreiveStyles uri doc http uri + let transcript = stylize style html + C8.putStrLn $ renderElLBS $ styleToSSML $ applyCounters transcript + return () renderElLBS el = XML.renderLBS XML.def $ XML.Document { XML.documentPrologue = XML.Prologue [] Nothing [], @@ -59,16 +61,27 @@ retreiveStyles uri html manager base = do authorStyle :: CSSCond.ConditionalStyles StyleTree authorStyle = H2C.html2css html base - loadURL url = do -- TODO parallelise. - request <- requestFromURI url - response <- HTTP.httpLbs request manager - return $ Txt.pack $ C8.unpack $ HTTP.responseBody response + loadURL url = do + response <- fetchURL manager "text/css" url + return $ case response of + ("text/css", Left text) -> text + ("text/css", Right bytes) -> applyCSScharset charsets $ B.toStrict bytes + (_, _) -> "" lowerVars "speech" = CSSCond.B True lowerVars "-rhapsode" = CSSCond.B True lowerVars _ = CSSCond.B False lowerToks _ = CSSCond.B False +applyCSScharset (charset:charsets) bytes | cssCharset (CSSTok.tokenize text) == charset = text + | otherwise = applyCSScharset charsets bytes + where text = convertCharset charset bytes +cssCharset toks | (CSSTok.AtKeyword "charset":toks') <- skipCSSspace toks, + (CSSTok.String charset:_) <- skipCSSspace toks' = charset + | otherwise = "" +skipCSSspace (CSSTok.Whitespace:toks) = skipCSSspace toks +skipCSSspace toks = toks + loadUserStyles styles = do dir <- Dir.getXdgDirectory Dir.XdgConfig "rhapsode" exists <- Dir.doesDirectoryExist dir -- 2.30.2