~alcinnz/rhapsode

ref: c900dfa44910044796d0fa99bc987d4d7c2dbd4e rhapsode/src/Input.hs -rw-r--r-- 5.6 KiB
c900dfa4 — Adrian Cochrane Add new URL & commandline arguments input interface. 4 years ago
                                                                                
c900dfa4 Adrian Cochrane
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
{-# LANGUAGE OverloadedStrings #-}
module Input(parseArgs, ProgramCtl(..), fetchURL, convertCharset, charsets) where

import qualified Network.HTTP.Client as HTTP
import           Network.HTTP.Types
import qualified Data.Text as Txt
import           Data.Text.Lazy (fromStrict)
import           Data.Text (Text)
import           Data.Text.Encoding
import qualified Data.Text.IO as TxtIO
import qualified Data.ByteString.Lazy as B
import           Data.ByteString.Lazy (ByteString)
import qualified Text.HTML.DOM as HTML
import qualified Text.XML as XML
import           Network.URI
import qualified Data.Map as M

import Control.Concurrent.Async

import           System.IO
import System.Environment
import System.Directory
import Data.List
import Data.Default.Class

--- Commandline arguments
data ProgramCtl = ProgramCtl {
    docs :: [(URI, XML.Document)],
    outSSML :: Handle
}
parseArgs :: HTTP.Manager -> IO ProgramCtl
parseArgs http = do
    args <- getArgs
    let (inputs, outputs) = partition (isPrefixOf "-" . fst) $ preparseArgs args
    cwd <- getCurrentDirectory
    let base = URI {uriScheme = "file:", uriPath = cwd,
        uriAuthority = Nothing, uriQuery = "", uriFragment = ""}
    let inputs' = [(f, relativeTo uri base) | (f, Just uri) <- inputs]
    docs <- forConcurrently inputs' $ evalInput http
    outSSML <- parseSSMLout outputs
    return $ ProgramCtl {
        docs = [(uri, doc) | ((_, uri), doc) <- zip inputs' docs],
        outSSML = outSSML
    }

preparseArgs (flag@('-':_):val:args) | Just url <- parseURIReference val =
    (flag, Just url) : preparseArgs args
preparseArgs (flag@('+':_):args@(('+':_):_)) = (flag, Nothing) : preparseArgs args
preparseArgs (flag@('+':_):args@(('-':_):_)) = (flag, Nothing) : preparseArgs args
preparseArgs (flag@('+':_):val:args) = (flag, parseURIReference val) : preparseArgs args
preparseArgs (val:args) | Just url <- parseURIReference val =
    ("-h", Just url) : preparseArgs args
preparseArgs [] = []
preparseArgs (arg:_) = error ("Unsupported argument " ++ arg)

parseSSMLout (("+s", Just uri):_) = openFile (uriPath uri) ReadMode
parseSSMLout (("+ssml", Just uri):_) = openFile (uriPath uri) ReadMode
parseSSMLout (("+s", Nothing):_) = return stdout
parseSSMLout (("+ssml", Nothing):_) = return stdout
parseSSMLout (_:args) = parseSSMLout args
parseSSMLout [] = return stdout

evalInput http ("-h", url) = fetchDocument http "text/html" url
evalInput http ("-html", url) = fetchDocument http "text/html" url
evalInput http ("-x", url) = fetchDocument http "text/xml" url
evalInput http ("-xml", url) = fetchDocument http "text/xml" url
evalInput _ (flag, _) = error ("Unsupported input flag " ++ flag)

fetchDocument http mime uri = fetchURL http mime uri >>= parseDocument
parseDocument ("text/html", Left text) = return $ HTML.parseLT $ fromStrict text
parseDocument ("text/html", Right bytes) = return $ HTML.parseLBS bytes
parseDocument ("text/plain", Left text) = return $ docForText text
parseDocument ("text/plain", Right bytes) = return $ docForText $ decodeUtf8 $ B.toStrict bytes
parseDocument (_, Left text) | Right doc <- XML.parseText def $ fromStrict text = return doc
parseDocument (_, Right bytes) | Right doc <- XML.parseLBS def bytes = return doc
parseDocument (mime, _) = return $ docForText $ Txt.concat ["Unsupported MIMEtype ", Txt.pack mime]

docForText txt = XML.Document {
        XML.documentPrologue = XML.Prologue [] Nothing [],
        XML.documentRoot = XML.Element {
            XML.elementName = "pre",
            XML.elementAttributes = M.empty,
            XML.elementNodes = [XML.NodeContent txt]
        },
        XML.documentEpilogue = []
    }

--------

fetchURL :: HTTP.Manager -> String -> URI -> IO (String, Either Text ByteString)
fetchURL http defaultMIME uri | uriScheme uri `elem` ["http:", "https:"] = do
    request <- HTTP.requestFromURI uri
    response <- HTTP.httpLbs request http
    return $ case (
            HTTP.responseBody response,
            [val | ("content-type", val) <- HTTP.responseHeaders response]
      ) of
        ("", _) -> ("text/plain", Right $ B.fromStrict $ statusMessage $ HTTP.responseStatus response)
        (response, (mimetype:_)) -> let mime = Txt.toLower $ decodeUtf8 mimetype
            in resolveCharset (map (Txt.unpack . Txt.strip) $ Txt.splitOn ";" $ mime) response
        (response, []) -> (defaultMIME, Right response)

fetchURL _ defaultMIME uri@URI {uriScheme = "file:"} = do
    response <- B.readFile $ uriPath uri
    return (defaultMIME, Right response)

fetchURL _ _ uri = return ("text/plain", Left $ Txt.concat ["Unsupported link type ", Txt.pack $ uriScheme uri])

resolveCharset :: [String] -> ByteString -> (String, Either Text ByteString)
resolveCharset (mime:('c':'h':'a':'r':'s':'e':'t':'=':charset):_) response =
    (mime, Left $ convertCharset charset $ B.toStrict response)
resolveCharset (mime:_:params) response = resolveCharset (mime:params) response
resolveCharset [mime] response = (mime, Right $ response)
resolveCharset [] response = ("text/plain", Left "Filetype unspecified")

convertCharset "iso-8859-1" = decodeLatin1
convertCharset "latin1" = decodeLatin1
convertCharset "us-ascii" = decodeUtf8
convertCharset "utf-8" = decodeUtf8
convertCharset "utf-16be" = decodeUtf16BE
convertCharset "utf-16le" = decodeUtf16LE
convertCharset "utf-16" = decodeUtf16LE
convertCharset "utf-32be" = decodeUtf32BE
convertCharset "utf-32le" = decodeUtf32LE
convertCharset "utf-32" = decodeUtf32LE
convertCharset _ = \_ -> "Unsupported text encoding!"
charsets :: [Text]
charsets = ["iso-8859-1", "latin1", "us-ascii", "utf-8", "utf-16be", "utf-16le", "utf-16", "utf-32be", "utf-32le", "utf-32"]