{-# LANGUAGE OverloadedStrings #-}
module Input(parseArgs, ProgramCtl(..), fetchURL, convertCharset, charsets) where
import qualified Network.HTTP.Client as HTTP
import Network.HTTP.Types
import qualified Data.Text as Txt
import Data.Text.Lazy (fromStrict)
import Data.Text (Text)
import Data.Text.Encoding
import qualified Data.Text.IO as TxtIO
import qualified Data.ByteString.Lazy as B
import Data.ByteString.Lazy (ByteString)
import qualified Text.HTML.DOM as HTML
import qualified Text.XML as XML
import Network.URI
import qualified Data.Map as M
import Control.Concurrent.Async
import System.IO
import System.Environment
import System.Directory
import Data.List
import Data.Default.Class
--- Commandline arguments
data ProgramCtl = ProgramCtl {
docs :: [(URI, XML.Document)],
outSSML :: Maybe Handle,
outLinks :: Maybe Handle
}
parseArgs :: HTTP.Manager -> IO ProgramCtl
parseArgs http = do
args <- getArgs
let (inputs, outputs) = partition (isPrefixOf "-" . fst) $ preparseArgs args
cwd <- getCurrentDirectory
let base = URI {uriScheme = "file:", uriPath = cwd,
uriAuthority = Nothing, uriQuery = "", uriFragment = ""}
let inputs' = [(f, relativeTo uri base) | (f, Just uri) <- inputs]
docs <- forConcurrently inputs' $ evalInput http
outSSML <- parseSSMLout outputs
outLinks <- parseLinksOut outputs
return $ ProgramCtl {
docs = [(uri, doc) | ((_, uri), doc) <- zip inputs' docs],
outSSML = outSSML,
outLinks = outLinks
}
preparseArgs (flag@('-':_):val:args) | Just url <- parseURIReference val =
(flag, Just url) : preparseArgs args
preparseArgs (flag@('+':_):args@(('+':_):_)) = (flag, Nothing) : preparseArgs args
preparseArgs (flag@('+':_):args@(('-':_):_)) = (flag, Nothing) : preparseArgs args
preparseArgs [flag@('+':_)] = [(flag, Nothing)]
preparseArgs (flag@('+':_):val:args) = (flag, parseURIReference val) : preparseArgs args
preparseArgs (val:args) | Just url <- parseURIReference val =
("-h", Just url) : preparseArgs args
preparseArgs [] = []
preparseArgs (arg:_) = error ("Unsupported argument " ++ arg)
parseSSMLout (("+s", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseSSMLout (("+ssml", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseSSMLout (("+s", Nothing):_) = return $ Just stdout
parseSSMLout (("+ssml", Nothing):_) = return $ Just stdout
parseSSMLout (_:args) = parseSSMLout args
parseSSMLout [] = return Nothing
parseLinksOut (("+l", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseLinksOut (("+links", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseLinksOut (("+l", Nothing):_) = return $ Just stdout
parseLinksOut (("+links", Nothing):_) = return $ Just stdout
parseLinksOut (_:args) = parseLinksOut args
parseLinksOut [] = return Nothing
evalInput http ("-h", url) = fetchDocument http "text/html" url
evalInput http ("-html", url) = fetchDocument http "text/html" url
evalInput http ("-x", url) = fetchDocument http "text/xml" url
evalInput http ("-xml", url) = fetchDocument http "text/xml" url
evalInput _ (flag, _) = error ("Unsupported input flag " ++ flag)
fetchDocument http mime uri = fetchURL http mime uri >>= parseDocument
parseDocument ("text/html", Left text) = return $ HTML.parseLT $ fromStrict text
parseDocument ("text/html", Right bytes) = return $ HTML.parseLBS bytes
parseDocument ("text/plain", Left text) = return $ docForText text
parseDocument ("text/plain", Right bytes) = return $ docForText $ decodeUtf8 $ B.toStrict bytes
parseDocument (_, Left text) | Right doc <- XML.parseText def $ fromStrict text = return doc
parseDocument (_, Right bytes) | Right doc <- XML.parseLBS def bytes = return doc
parseDocument (mime, _) = return $ docForText $ Txt.concat ["Unsupported MIMEtype ", Txt.pack mime]
docForText txt = XML.Document {
XML.documentPrologue = XML.Prologue [] Nothing [],
XML.documentRoot = XML.Element {
XML.elementName = "pre",
XML.elementAttributes = M.empty,
XML.elementNodes = [XML.NodeContent txt]
},
XML.documentEpilogue = []
}
--------
fetchURL :: HTTP.Manager -> String -> URI -> IO (String, Either Text ByteString)
fetchURL http defaultMIME uri | uriScheme uri `elem` ["http:", "https:"] = do
request <- HTTP.requestFromURI uri
response <- HTTP.httpLbs request http
return $ case (
HTTP.responseBody response,
[val | ("content-type", val) <- HTTP.responseHeaders response]
) of
("", _) -> ("text/plain", Right $ B.fromStrict $ statusMessage $ HTTP.responseStatus response)
(response, (mimetype:_)) -> let mime = Txt.toLower $ decodeUtf8 mimetype
in resolveCharset (map (Txt.unpack . Txt.strip) $ Txt.splitOn ";" $ mime) response
(response, []) -> (defaultMIME, Right response)
fetchURL _ defaultMIME uri@URI {uriScheme = "file:"} = do
response <- B.readFile $ uriPath uri
return (defaultMIME, Right response)
fetchURL _ _ uri = return ("text/plain", Left $ Txt.concat ["Unsupported link type ", Txt.pack $ uriScheme uri])
resolveCharset :: [String] -> ByteString -> (String, Either Text ByteString)
resolveCharset (mime:('c':'h':'a':'r':'s':'e':'t':'=':charset):_) response =
(mime, Left $ convertCharset charset $ B.toStrict response)
resolveCharset (mime:_:params) response = resolveCharset (mime:params) response
resolveCharset [mime] response = (mime, Right $ response)
resolveCharset [] response = ("text/plain", Left "Filetype unspecified")
convertCharset "iso-8859-1" = decodeLatin1
convertCharset "latin1" = decodeLatin1
convertCharset "us-ascii" = decodeUtf8
convertCharset "utf-8" = decodeUtf8
convertCharset "utf-16be" = decodeUtf16BE
convertCharset "utf-16le" = decodeUtf16LE
convertCharset "utf-16" = decodeUtf16LE
convertCharset "utf-32be" = decodeUtf32BE
convertCharset "utf-32le" = decodeUtf32LE
convertCharset "utf-32" = decodeUtf32LE
convertCharset _ = \_ -> "Unsupported text encoding!"
charsets :: [Text]
charsets = ["iso-8859-1", "latin1", "us-ascii", "utf-8", "utf-16be", "utf-16le", "utf-16", "utf-32be", "utf-32le", "utf-32"]