~alcinnz/rhapsode

ref: 091c7d82762d76096de04e2d3f955f5f28b9ccd8 rhapsode/src/Links.hs -rw-r--r-- 12.1 KiB
091c7d82 — Adrian Cochrane Refactor link extraction to match new link dispatch, & defer internationalization to config 1 year, 7 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Links(extractLinks, linkToText, Link(..), c_extractLinks) where

import Text.XML
import qualified Data.Map as M
import Network.MIME.Info as MIME
import Network.URI
import Data.Text (Text, unpack, append, pack, replace, strip)
import qualified Data.Text.Lazy as LTxt
import qualified Data.Text.Foreign as FTxt
import Data.Maybe
import Text.Read (readMaybe)

-- FIXME: Expose this API from HURL XML...
import Network.URI.Fetch.XML.Table (splitTable)

import Types
import Foreign.StablePtr
import Foreign.C.String
import Foreign.Marshal.Array
import Control.Monad (forM)
import Control.Exception (catch, IOException)

import System.Directory -- For locating links.xml, suggestions.gmni
import System.FilePath
import System.IO (hPrint, stderr, hGetContents) -- For error reporting, Voice2Json

-- For suggestions.gmni
import qualified Data.Set as Set
import Data.List (nub, intercalate)
import Control.Concurrent (forkIO)

-- For Voice2Json
import Data.Char
import System.Process
import Data.Aeson
import qualified Data.HashMap.Strict as HM
import qualified Data.ByteString.Lazy as LBS

data Link = Link {
    label :: Text,
    title :: Text,
    href :: URI
}

linkToText :: Link -> Text
linkToText (Link label' title' href') =
    rmWs label' +++ "\t" +++ rmWs title' +++ "\t" +++ pack (show href')

rmWs text = strip $ replace "\t" " " $ replace "\n" " " text

extractLinks :: Document -> [Link]
extractLinks (Document prologue root misc) =
    extractMisc (prologueBefore prologue) ++ extractMisc (prologueAfter prologue) ++
    extractEl [] root ++ extractMisc misc

extractMisc :: [Miscellaneous] -> [Link]
extractMisc (MiscInstruction (Instruction target dat):misc)
    | Just uri <- parseURIReference $ unpack target = Link dat "" uri : extractMisc misc
extractMisc (_:misc) = extractMisc misc
extractMisc [] = []

extractEl path el@(Element (Name "details" _ _) _ childs) =
   [Link (nodesText summary' $ nodesText childs "") "+" nullURI {
        uriFragment = '#':'.':intercalate "." (map show $ reverse path)
    } | NodeElement summary@(Element (Name "summary" _ _) _ summary') <- childs] ++
    extractNodes (0:path) childs
-- Special case for showing Appstream metadata of compatible apps.
-- Fallback for incompatible package manager UIs.
extractEl _ (Element "{https://specifications.freedesktop.org/metainfo/1.0}url" attrs childs)
    | Just label <- "{https://specifications.freedesktop.org/metainfo/1.0}type" `M.lookup` attrs,
      Just url <- parseAbsoluteURI $ unpack $ nodesText childs "" = [Link label "" url]
extractEl _ el@(Element _ attrs [])
    | Just "alternate" <- "rel" `M.lookup` attrs', Just typ <- "type" `M.lookup` attrs',
            Just val <- "href" `M.lookup` attrs', Just uri <- parseURIReference $ unpack val =
--        let Application name _ title _ = mimeInfo $ unpack typ
--        in [Link (pack name) (pack title) uri] -- FIXME `mimeInfo` freezes...
        [Link typ "" uri]
  where attrs' = M.mapKeys nameLocalName attrs
extractEl _ el@(Element (Name "link" _ _) attrs [])
    | Just "stylesheet" <- "rel" `M.lookup` attrs', Nothing <- "title" `M.lookup` attrs',
            Just val <- "href" `M.lookup` attrs', Just uri <- parseURIReference $ unpack val =
        let Application name _ title _ = mimeInfo "text/css"
        in [Link (pack name) (pack title) uri]
  where attrs' = M.mapKeys nameLocalName attrs
extractEl path (Element (Name "table" _ _) _ childs) =
    extractTable path (splitTable childs) ++ extractNodes (0:path) childs
extractEl path el@(Element _ _ children) =
    extractElAttr el "href" ++
    extractElAttr el "longdesc" ++
    extractElAttr el "src" ++
    extractNodes (0:path) children

extractElAttr (Element _ attrs children) attr
        | Just val <- attr `M.lookup` attrs',
            Just uri <- parseURIReference $ unpack val = [Link label' title' uri]
        | otherwise = []
    where
        label' = nodesText children $ M.findWithDefault "" "rel" attrs'
        title' = fromMaybe "" $ M.lookup "title" attrs'
        attrs' = M.mapKeys nameLocalName attrs

extractTable path (thead, _, _) = extractTable' path [el | NodeElement el <- thead]
extractTable' path (Element (Name "thead" _ _) _ childs:_) =
    extractTable' path [el | NodeElement el <- childs]
extractTable' path (Element (Name "tr" _ _) _ childs:_) = extractTR path 0 childs
extractTable' path (_:els) = extractTable' path els
extractTable' _ [] = []

extractTR path count (NodeElement (Element (Name name _ _) attrs childs):nodes)
    | name `elem` ["th", "td"] =
        extractTH path count ordering childs : extractTR path count' nodes
  where
    count' = count + fromMaybe 1 colspan
    colspan = readMaybe =<< unpack <$> M.lookup "colspan" attrs'
    ordering = M.lookup "aria-sort" attrs'
    attrs' = M.mapKeys nameLocalName attrs
extractTR path count (_:nodes) = extractTR path count nodes
extractTR _ _ [] = []
extractTH path count ordering nodes = Link {
        label = nodesText nodes "",
        title = pack $ show count,
        href = nullURI {
            uriFragment = '#':'-':'a':'r':'g':'o':'-':'%':
                intercalate "." [show n | n <- path] ++ o ordering : show count
        }
      }
    where
        o (Just "ascending") = '>'
        o _ = '<'

extractNodes p@(n:path) (NodeElement el:nodes) = extractEl p el ++ extractNodes (succ n:path) nodes
extractNodes path (NodeInstruction instruct:nodes) =
    extractMisc [MiscInstruction instruct] ++ extractNodes path nodes
extractNodes path (_:nodes) = extractNodes path nodes
extractNodes _ [] = []

(+++) = append
nodesText :: [Node] -> Text -> Text
nodesText (NodeElement (Element _ attrs children):nodes) def =
    nodesText children def +++ nodesText nodes def
nodesText (NodeContent text:nodes) def = text +++ nodesText nodes def
nodesText (_:nodes) def = nodesText nodes def
nodesText [] def = def

linksFromPage :: Page RhapsodeCSS -> [Link]
linksFromPage Page {
        pageURL = url',
        pageTitle = title',
        html = html',
        apps = apps',
        backStack = back', forwardStack = forward'
    } =
        [link' n desc $ URI "app:" Nothing id "" "" | Application n _ desc id <- apps'] ++
        extractLinks html'

head' (a:_) = [a]
head' [] = []
link' l t h = Link (pack l) (pack t) h

readBookmarks :: IO Document
readBookmarks = do
    dir <- getXdgDirectory XdgData "rhapsode"
    let file = dir </> "links.xml"
    exists <- doesFileExist file

    if exists then return () else do
        -- Copy defaults into userdir
        dirs <- getXdgDirectoryList XdgDataDirs
        files' <- forM dirs $ \dir' -> do
            let file' = dir' </> "rhapsode" </> "links.xml"
            exists' <- doesFileExist file'
            return $ if exists' then Just file' else Nothing
        case catMaybes files' of
            [] -> return ()
            (file':_) -> copyFileWithMetadata file' file

    exists' <- doesFileExist file
    if exists' then Text.XML.readFile def file `catch` handleInvalid else nodoc
  where
    handleInvalid err@(InvalidXMLFile _ _) = hPrint stderr err >> nodoc
    nodoc = return $ Document (Prologue [] Nothing []) (Element "empty" M.empty []) []

-- | Write out a file of most frequently encountered unvisited links.
-- Hopefully this'll help surfers rely less on YouTube, et al's hueristics.
updateSuggestions :: Page RhapsodeCSS -> IO ()
updateSuggestions page = do
    let links = extractLinks $ html page
    let domain = maybe "" show $ uriAuthority $ pageURL page

    dir <- getXdgDirectory XdgData "rhapsode"
    let path = dir </> "suggestions.gmni"
    exists <- doesFileExist path
    suggestions <- if not exists then return [] else do
        file <- readStrict path
        return [line' | line <- lines file, line'@(_:uri':_) <- [words line], not (pack uri' `Set.member` visitedURLs page)]

    let suggestions' = suggestions ++ nub [["=>", uri', domain] | link <- links,
            let uri' = uriToString id (href link) "", not (pack uri' `Set.member` visitedURLs page)]

    createDirectoryIfMissing True dir
    Prelude.writeFile path $ unlines $ map unwords suggestions'

-------
--- Voice2Json language models
-------

-- | Output links to a Voice2Json sentences.ini grammar.
outputSentences _ "" = return ()
outputSentences links dir = do
        Prelude.writeFile (dir </> "sentences.ini") $ unlines sentences
        createProcess (proc "voice2json" ["--profile", dir, "train-profile"]){ std_err = NoStream, std_out = NoStream }
        return ()
      `catch` \(_ :: IOException) -> return () -- Assume the UI has already warned Voice2Json isn't available.
    where
      sentences = "[links]" : [
            unwords $ words $ map validChar line -- Enforce valid sentences.ini syntax.
            | line@(_:_) <- map (unpack . label) links ++ map (unpack . title) links ++ map (show . href) links
        ]
      -- | Can this character appear in a sentences.ini rule without escaping?
      validChar ch | not (isAscii ch) || isSpace ch || isAlphaNum ch = ch
      validChar _ = ' '

------
--- C API
------
foreign export ccall c_extractLinks :: StablePtr (Page RhapsodeCSS) -> CString -> IO (CArray CString)

c_extractLinks c_page c_v2jProfile = do
    page <- deRefStablePtr c_page
    v2jProfile <- peekCString c_v2jProfile
    forkIO $ updateSuggestions page -- background process for useful navigation aid.
    bookmarks <- readBookmarks
    let links = linksFromPage page ++ extractLinks bookmarks
    forkIO $ outputSentences links v2jProfile
    ret <- forM links $ \link -> do
        c_label <- text2cstring $ strip $ label link
        c_title <- text2cstring $ strip $ title link
        c_href <- newCString $ uriToString id (href link) ""
        return [c_label, c_title, c_href]
    nil <- newCString " "
    newArray0 nil $ concat ret

text2cstring txt = FTxt.withCStringLen txt $ \s -> (peekCStringLen s >>= newCString)

------
--- C helper functions
------

foreign export ccall c_formatLink :: CString -> CString -> CString -> IO CString

c_formatLink c_label c_title c_url = do
    label <- ctext c_label
    title <- ctext c_title
    url <- ctext c_url

    sfx <- getXdgDirectory XdgCache "rhapsode"
    let bulletpoint_el = audio (sfx </> "bulletpoint.wav")
    let label_el = prosody [("pitch", "low")] label
    let title_el = prosody [("volume", "soft")] title
    let link_el = audio (sfx </> "link.wav")
    let url_el = style "punctuation" "all" $ style "capital_letters" "pitch" $ prosody [("rate", "fast")] url
    let root = el "speak" [] $ map NodeElement [bulletpoint_el, label_el, title_el, link_el, url_el]

    let ssml = renderText def $ Document (Prologue [] Nothing []) root []
    newCString $ LTxt.unpack ssml
  where
    ctext cstr = pack <$> peekCString cstr
    el name attrs childs = Element name (M.fromList attrs) childs
    audio src = el "audio" [("src", pack src)] []
    prosody attrs txt = el "prosody" attrs [NodeContent txt]
    style field mode inner = el "tts:style" [("field", field), ("mode", mode)] [NodeElement inner]

foreign export ccall c_dading :: IO CString

c_dading = do
    sfx <- getXdgDirectory XdgCache "rhapsode"
    let link_el = audio (sfx </> "link.wav")
    let root = el "speak" [] [NodeElement link_el]
    let ssml = renderText def $ Document (Prologue [] Nothing []) root []
    newCString $ LTxt.unpack ssml
  where
    el name attrs childs = Element name (M.fromList attrs) childs
    audio src = el "audio" [("src", pack src)] []

--- For Voice2JSON

foreign export ccall c_dataDir :: CString -> IO CString

c_dataDir c_subdir = do
    subdir <- peekCString c_subdir
    cache <- getXdgDirectory XdgData "rhapsode"
    newCString (cache </> subdir)

foreign export ccall c_recognizeIntent :: CString -> IO CString

c_recognizeIntent c_profile = do
    profile <- peekCString c_profile
    (_, Just pipe, _, _) <- createProcess (proc "voice2json" [
        "--profile", profile,
        "transcribe-stream",
        "-c", "1"]){std_out = CreatePipe}
    (_, Just out, _, _) <- createProcess (proc "voice2json" [
        "--profile", profile,
        "recognize-intent"]){std_in = UseHandle pipe, std_out = CreatePipe}
    intent <- LBS.hGetContents out
    let transcript = case decode intent of
            Just (Object obj) | Just (String txt) <- "text" `HM.lookup` obj -> unpack txt
            _ -> ""
    newCString transcript