~alcinnz/rhapsode

ref: 2d6a43f4853f32149d5e751dc7af7cbc6c2e7122 rhapsode/src/Input.hs -rw-r--r-- 4.0 KiB
2d6a43f4 — Adrian Cochrane Fix CSS charset detection. 4 years ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
{-# LANGUAGE OverloadedStrings #-}
module Input(parseArgs, ProgramCtl(..)) where

import           Data.Text.Lazy (fromStrict)
import qualified Data.Text as Txt
import           Data.Text.Encoding
import qualified Data.ByteString.Lazy as B
import qualified Text.HTML.DOM as HTML
import qualified Text.XML as XML
import           Network.URI
import           Network.URI.Fetch
import qualified Data.Map as M

import Control.Concurrent.Async
import System.IO
import System.Environment
import System.Directory
import Data.List
import Data.Default.Class

--- Commandline arguments
data ProgramCtl = ProgramCtl {
    docs :: [(URI, XML.Document)],
    outSSML :: Maybe Handle,
    outLinks :: Maybe Handle
}
parseArgs :: Session -> IO ProgramCtl
parseArgs http = do
    args <- getArgs
    let (inputs, outputs) = partition (isPrefixOf "-" . fst) $ preparseArgs args
    cwd <- getCurrentDirectory
    let base = URI {uriScheme = "file:", uriPath = cwd,
        uriAuthority = Nothing, uriQuery = "", uriFragment = ""}
    let inputs' = [(f, relativeTo uri base) | (f, Just uri) <- inputs]
    docs <- forConcurrently inputs' $ evalInput http

    outSSML <- parseSSMLout outputs
    outLinks <- parseLinksOut outputs
    return $ ProgramCtl {
        docs = [(uri, doc) | ((_, uri), doc) <- zip inputs' docs],
        outSSML = outSSML,
        outLinks = outLinks
    }

preparseArgs (flag@('-':_):val:args) | Just url <- parseURIReference val =
    (flag, Just url) : preparseArgs args
preparseArgs (flag@('+':_):args@(('+':_):_)) = (flag, Nothing) : preparseArgs args
preparseArgs (flag@('+':_):args@(('-':_):_)) = (flag, Nothing) : preparseArgs args
preparseArgs [flag@('+':_)] = [(flag, Nothing)]
preparseArgs (flag@('+':_):val:args) = (flag, parseURIReference val) : preparseArgs args
preparseArgs (val:args) | Just url <- parseURIReference val =
    ("-h", Just url) : preparseArgs args
preparseArgs [] = []
preparseArgs (arg:_) = error ("Unsupported argument " ++ arg)

parseSSMLout (("+s", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseSSMLout (("+ssml", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseSSMLout (("+s", Nothing):_) = return $ Just stdout
parseSSMLout (("+ssml", Nothing):_) = return $ Just stdout
parseSSMLout (_:args) = parseSSMLout args
parseSSMLout [] = return Nothing

parseLinksOut (("+l", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseLinksOut (("+links", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseLinksOut (("+l", Nothing):_) = return $ Just stdout
parseLinksOut (("+links", Nothing):_) = return $ Just stdout
parseLinksOut (_:args) = parseLinksOut args
parseLinksOut [] = return Nothing


evalInput http ("-h", url) = fetchDocument http ["text/html", "text/xml", "text/plain"] url
evalInput http ("-html", url) = fetchDocument http ["text/html", "text/xml", "text/plain"] url
evalInput http ("-x", url) = fetchDocument http ["text/xml", "text/html", "text/plain"] url
evalInput http ("-xml", url) = fetchDocument http ["text/xml", "text/html", "text/plain"] url
evalInput _ (flag, _) = error ("Unsupported input flag " ++ flag)

fetchDocument http mime uri = fetchURL http mime uri >>= parseDocument
parseDocument ("text/html", Left text) = return $ HTML.parseLT $ fromStrict text
parseDocument ("text/html", Right bytes) = return $ HTML.parseLBS bytes
parseDocument ("text/plain", Left text) = return $ docForText text
parseDocument ("text/plain", Right bytes) = return $ docForText $ decodeUtf8 $ B.toStrict bytes
parseDocument (_, Left text) | Right doc <- XML.parseText def $ fromStrict text = return doc
parseDocument (_, Right bytes) | Right doc <- XML.parseLBS def bytes = return doc
parseDocument (mime, _) = return $ docForText $ Txt.concat ["Unsupported MIMEtype ", Txt.pack mime]

docForText txt = XML.Document {
        XML.documentPrologue = XML.Prologue [] Nothing [],
        XML.documentRoot = XML.Element {
            XML.elementName = "pre",
            XML.elementAttributes = M.empty,
            XML.elementNodes = [XML.NodeContent txt]
        },
        XML.documentEpilogue = []
    }