~alcinnz/rhapsode

ref: 06d8ec3612e8fd2d8a5a7109c4e79bb238a43bc4 rhapsode/src/Input.hs -rw-r--r-- 6.1 KiB
06d8ec36 — Adrian Cochrane Prepare API to expose to C. 4 years ago
                                                                                
c900dfa4 Adrian Cochrane
06d8ec36 Adrian Cochrane
c900dfa4 Adrian Cochrane
e41dd031 Adrian Cochrane
c900dfa4 Adrian Cochrane
e41dd031 Adrian Cochrane
c900dfa4 Adrian Cochrane
e41dd031 Adrian Cochrane
c900dfa4 Adrian Cochrane
2e1b1a76 Adrian Cochrane
c900dfa4 Adrian Cochrane
9f81815f Adrian Cochrane
c900dfa4 Adrian Cochrane
06d8ec36 Adrian Cochrane
c900dfa4 Adrian Cochrane
36f30081 Adrian Cochrane
c900dfa4 Adrian Cochrane
e41dd031 Adrian Cochrane
c900dfa4 Adrian Cochrane
36f30081 Adrian Cochrane
c900dfa4 Adrian Cochrane
36f30081 Adrian Cochrane
c900dfa4 Adrian Cochrane
36f30081 Adrian Cochrane
c900dfa4 Adrian Cochrane
36f30081 Adrian Cochrane
c900dfa4 Adrian Cochrane
36f30081 Adrian Cochrane
c900dfa4 Adrian Cochrane
36f30081 Adrian Cochrane
c900dfa4 Adrian Cochrane
e41dd031 Adrian Cochrane
c900dfa4 Adrian Cochrane
06d8ec36 Adrian Cochrane
2e1b1a76 Adrian Cochrane
c900dfa4 Adrian Cochrane
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
{-# LANGUAGE OverloadedStrings #-}
module Input(parseArgs, ProgramCtl(..), fetchURLs) where

import           Data.Text.Lazy (fromStrict)
import qualified Data.Text as Txt
import           Data.Text.Encoding
import qualified Data.ByteString.Lazy as B
import qualified Text.HTML.DOM as HTML
import qualified Text.XML as XML
import           Network.URI
import           Network.URI.Fetch
import qualified Data.Map as M

import Control.Concurrent.Async
import System.IO
import System.Environment
import System.Directory
import System.IO.Temp
import Data.List
import Data.Default.Class
import Data.Maybe (fromMaybe)
import System.FilePath

import qualified Data.CSS.Syntax.StyleSheet as CSS
import Data.CSS.Preprocessor.Conditions
import qualified Data.CSS.Syntax.Tokens as CSSTok
import qualified Data.HTML2CSS as H2C
import Network.URI.Charset

--- Commandline arguments
data ProgramCtl = ProgramCtl {
    docs :: [(URI, XML.Document)],
    outSSML :: Maybe Handle,
    outLinks :: Maybe Handle
}
parseArgs :: Session -> IO ProgramCtl
parseArgs http = do
    args <- getArgs
    let (inputs, outputs) = partition (isPrefixOf "-" . fst) $ preparseArgs args
    cwd <- getCurrentDirectory
    let base = URI {uriScheme = "file:", uriPath = cwd,
        uriAuthority = Nothing, uriQuery = "", uriFragment = ""}
    let inputs' = [(f, relativeTo uri base) | (f, Just uri) <- inputs]
    docs <- forConcurrently inputs' $ evalInput http

    outSSML <- parseSSMLout outputs
    outLinks <- parseLinksOut outputs
    return $ ProgramCtl {
        docs = [(uri, doc) | ((_, uri), doc) <- zip inputs' docs],
        outSSML = outSSML,
        outLinks = outLinks
    }

preparseArgs (flag@('-':_):val:args) | Just url <- parseURIReference val =
    (flag, Just url) : preparseArgs args
preparseArgs (flag@('+':_):args@(('+':_):_)) = (flag, Nothing) : preparseArgs args
preparseArgs (flag@('+':_):args@(('-':_):_)) = (flag, Nothing) : preparseArgs args
preparseArgs [flag@('+':_)] = [(flag, Nothing)]
preparseArgs (flag@('+':_):val:args) = (flag, parseURIReference val) : preparseArgs args
preparseArgs (val:args) | Just url <- parseURIReference val =
    ("-h", Just url) : preparseArgs args
preparseArgs [] = []
preparseArgs (arg:_) = error ("Unsupported argument " ++ arg)

parseSSMLout (("+s", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseSSMLout (("+ssml", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseSSMLout (("+s", Nothing):_) = return $ Just stdout
parseSSMLout (("+ssml", Nothing):_) = return $ Just stdout
parseSSMLout (_:args) = parseSSMLout args
parseSSMLout [] = return Nothing

parseLinksOut (("+l", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseLinksOut (("+links", Just uri):_) = openFile (uriPath uri) WriteMode >>= return . Just
parseLinksOut (("+l", Nothing):_) = return $ Just stdout
parseLinksOut (("+links", Nothing):_) = return $ Just stdout
parseLinksOut (_:args) = parseLinksOut args
parseLinksOut [] = return Nothing


evalInput http ("-h", url) = fetchDocument http ["text/html", "text/xml", "text/plain"] url
evalInput http ("-html", url) = fetchDocument http ["text/html", "text/xml", "text/plain"] url
evalInput http ("-x", url) = fetchDocument http ["text/xml", "text/html", "text/plain"] url
evalInput http ("-xml", url) = fetchDocument http ["text/xml", "text/html", "text/plain"] url
evalInput _ (flag, _) = error ("Unsupported input flag " ++ flag)

fetchURLs session refererDoc refererURL srcs = forConcurrently srcs $ \(mime, url) -> do
    let u = relativeTo url refererURL
    let charsets' = map Txt.unpack charsets
    resp <- fetchURL session mime u
    case resp of
        ("text/css", bytes) -> let
                cssParser = CSS.parseForURL (conditionalStyles u "document") u
            in return (refererDoc, cssParser $ applyCSScharset charsets' bytes)
        _ -> parseDocument session resp >>= \doc -> return (doc, H2C.html2css doc refererURL)

applyCSScharset (charset:charsets) (Right bytes)
        | cssCharset (CSSTok.tokenize text) == Txt.pack charset = text
        | otherwise = applyCSScharset charsets $ Right bytes
    where
        text = convertCharset charset $ B.toStrict bytes
applyCSScharset _ (Right bytes) = convertCharset "utf-8" $ B.toStrict bytes
applyCSScharset _ (Left text) = text
cssCharset toks | (CSSTok.AtKeyword "charset":toks') <- skipCSSspace toks,
        (CSSTok.String charset:_) <- skipCSSspace toks' = charset
    | otherwise = ""
skipCSSspace (CSSTok.Whitespace:toks) = skipCSSspace toks
skipCSSspace toks = toks

fetchDocument http mime uri = fetchURL http mime uri >>= parseDocument http
parseDocument _ ("text/html", Left text) = return $ HTML.parseLT $ fromStrict text
parseDocument _ ("text/html", Right bytes) = return $ HTML.parseLBS bytes
parseDocument _ ("text/plain", Left text) = return $ docForText text
parseDocument _ ("text/plain", Right bytes) = return $ docForText $ decodeUtf8 $ B.toStrict bytes
parseDocument _ ("application/xhtml+xml", Left text) | Right doc <- XML.parseText def $ fromStrict text = return doc
  | otherwise = return $ docForText "Unreadable webpage!"
parseDocument _ ("application/xhtml+xml", Right bytes) | Right doc <- XML.parseLBS def bytes = return doc
  | otherwise = return $ docForText "Unreadable webpage!"
parseDocument session (mime, download) = do
    localURI <- withSystemTempFile "rhapsode-download" $ writeDownloadToFile download
    result <- dispatchByMIME session mime localURI
    case result of
        Just text -> return $ docForText $ Txt.pack text
        Nothing -> parseDocument session ("application/xhtml+xml", download)

writeDownloadToFile (Left text) file handle = do
    hPutStr handle $ Txt.unpack text
    return $ URI "file:" Nothing file "" ""
writeDownloadToFile (Right bytes) file handle = do
    B.hPut handle bytes
    return $ URI "file:" Nothing file "" ""

docForText txt = XML.Document {
        XML.documentPrologue = XML.Prologue [] Nothing [],
        XML.documentRoot = XML.Element {
            XML.elementName = "pre",
            XML.elementAttributes = M.empty,
            XML.elementNodes = [XML.NodeContent txt]
        },
        XML.documentEpilogue = []
    }