{-# LANGUAGE OverloadedStrings #-} module Input(parseArgs, ProgramCtl(..), fetchURL, convertCharset, charsets) where import qualified Network.HTTP.Client as HTTP import Network.HTTP.Types import qualified Data.Text as Txt import Data.Text.Lazy (fromStrict) import Data.Text (Text) import Data.Text.Encoding import qualified Data.Text.IO as TxtIO import qualified Data.ByteString.Lazy as B import Data.ByteString.Lazy (ByteString) import qualified Text.HTML.DOM as HTML import qualified Text.XML as XML import Network.URI import qualified Data.Map as M import Control.Concurrent.Async import System.IO import System.Environment import System.Directory import Data.List import Data.Default.Class --- Commandline arguments data ProgramCtl = ProgramCtl { docs :: [(URI, XML.Document)], outSSML :: Maybe Handle, outLinks :: Maybe Handle } parseArgs :: HTTP.Manager -> IO ProgramCtl parseArgs http = do args <- getArgs let (inputs, outputs) = partition (isPrefixOf "-" . fst) $ preparseArgs args cwd <- getCurrentDirectory let base = URI {uriScheme = "file:", uriPath = cwd, uriAuthority = Nothing, uriQuery = "", uriFragment = ""} let inputs' = [(f, relativeTo uri base) | (f, Just uri) <- inputs] docs <- forConcurrently inputs' $ evalInput http outSSML <- parseSSMLout outputs outLinks <- parseLinksOut outputs return $ ProgramCtl { docs = [(uri, doc) | ((_, uri), doc) <- zip inputs' docs], outSSML = outSSML, outLinks = outLinks } preparseArgs (flag@('-':_):val:args) | Just url <- parseURIReference val = (flag, Just url) : preparseArgs args preparseArgs (flag@('+':_):args@(('+':_):_)) = (flag, Nothing) : preparseArgs args preparseArgs (flag@('+':_):args@(('-':_):_)) = (flag, Nothing) : preparseArgs args preparseArgs [flag@('+':_)] = [(flag, Nothing)] preparseArgs (flag@('+':_):val:args) = (flag, parseURIReference val) : preparseArgs args preparseArgs (val:args) | Just url <- parseURIReference val = ("-h", Just url) : preparseArgs args preparseArgs [] = [] preparseArgs (arg:_) = error ("Unsupported argument " ++ arg) parseSSMLout (("+s", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just parseSSMLout (("+ssml", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just parseSSMLout (("+s", Nothing):_) = return $ Just stdout parseSSMLout (("+ssml", Nothing):_) = return $ Just stdout parseSSMLout (_:args) = parseSSMLout args parseSSMLout [] = return Nothing parseLinksOut (("+l", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just parseLinksOut (("+links", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just parseLinksOut (("+l", Nothing):_) = return $ Just stdout parseLinksOut (("+links", Nothing):_) = return $ Just stdout parseLinksOut (_:args) = parseLinksOut args parseLinksOut [] = return Nothing evalInput http ("-h", url) = fetchDocument http "text/html" url evalInput http ("-html", url) = fetchDocument http "text/html" url evalInput http ("-x", url) = fetchDocument http "text/xml" url evalInput http ("-xml", url) = fetchDocument http "text/xml" url evalInput _ (flag, _) = error ("Unsupported input flag " ++ flag) fetchDocument http mime uri = fetchURL http mime uri >>= parseDocument parseDocument ("text/html", Left text) = return $ HTML.parseLT $ fromStrict text parseDocument ("text/html", Right bytes) = return $ HTML.parseLBS bytes parseDocument ("text/plain", Left text) = return $ docForText text parseDocument ("text/plain", Right bytes) = return $ docForText $ decodeUtf8 $ B.toStrict bytes parseDocument (_, Left text) | Right doc <- XML.parseText def $ fromStrict text = return doc parseDocument (_, Right bytes) | Right doc <- XML.parseLBS def bytes = return doc parseDocument (mime, _) = return $ docForText $ Txt.concat ["Unsupported MIMEtype ", Txt.pack mime] docForText txt = XML.Document { XML.documentPrologue = XML.Prologue [] Nothing [], XML.documentRoot = XML.Element { XML.elementName = "pre", XML.elementAttributes = M.empty, XML.elementNodes = [XML.NodeContent txt] }, XML.documentEpilogue = [] } -------- fetchURL :: HTTP.Manager -> String -> URI -> IO (String, Either Text ByteString) fetchURL http defaultMIME uri | uriScheme uri `elem` ["http:", "https:"] = do request <- HTTP.requestFromURI uri response <- HTTP.httpLbs request http return $ case ( HTTP.responseBody response, [val | ("content-type", val) <- HTTP.responseHeaders response] ) of ("", _) -> ("text/plain", Right $ B.fromStrict $ statusMessage $ HTTP.responseStatus response) (response, (mimetype:_)) -> let mime = Txt.toLower $ decodeUtf8 mimetype in resolveCharset (map (Txt.unpack . Txt.strip) $ Txt.splitOn ";" $ mime) response (response, []) -> (defaultMIME, Right response) fetchURL _ defaultMIME uri@URI {uriScheme = "file:"} = do response <- B.readFile $ uriPath uri return (defaultMIME, Right response) fetchURL _ _ uri = return ("text/plain", Left $ Txt.concat ["Unsupported link type ", Txt.pack $ uriScheme uri]) resolveCharset :: [String] -> ByteString -> (String, Either Text ByteString) resolveCharset (mime:('c':'h':'a':'r':'s':'e':'t':'=':charset):_) response = (mime, Left $ convertCharset charset $ B.toStrict response) resolveCharset (mime:_:params) response = resolveCharset (mime:params) response resolveCharset [mime] response = (mime, Right $ response) resolveCharset [] response = ("text/plain", Left "Filetype unspecified") convertCharset "iso-8859-1" = decodeLatin1 convertCharset "latin1" = decodeLatin1 convertCharset "us-ascii" = decodeUtf8 convertCharset "utf-8" = decodeUtf8 convertCharset "utf-16be" = decodeUtf16BE convertCharset "utf-16le" = decodeUtf16LE convertCharset "utf-16" = decodeUtf16LE convertCharset "utf-32be" = decodeUtf32BE convertCharset "utf-32le" = decodeUtf32LE convertCharset "utf-32" = decodeUtf32LE convertCharset _ = \_ -> "Unsupported text encoding!" charsets :: [Text] charsets = ["iso-8859-1", "latin1", "us-ascii", "utf-8", "utf-16be", "utf-16le", "utf-16", "utf-32be", "utf-32le", "utf-32"]