~alcinnz/rhapsode

ref: 9f81815facf331222276656692a6a67413d6c4fe rhapsode/src/Links.hs -rw-r--r-- 1.9 KiB
9f81815f — Adrian Cochrane CSS fixes, start implementing downloads. 4 years ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
{-# LANGUAGE OverloadedStrings #-}
module Links(extractLinks, linkToText) where

import Text.XML
import qualified Data.Map as M
import Network.URI
import Data.Text (Text, unpack, append, pack, replace, strip)
import Data.Maybe

data Link = Link {
    label :: Text,
    title :: Text,
    href :: URI
}

linkToText :: Link -> Text
linkToText (Link label' title' href') =
    rmWs label' +++ "\t" +++ rmWs title' +++ "\t" +++ pack (show href')

rmWs text = strip $ replace "\t" " " $ replace "\n" " " text

extractLinks :: Document -> [Link]
extractLinks (Document prologue root misc) =
    extractMisc (prologueBefore prologue) ++ extractMisc (prologueAfter prologue) ++
    extractEl root ++ extractMisc misc

extractMisc :: [Miscellaneous] -> [Link]
extractMisc (MiscInstruction (Instruction target dat):misc)
    | Just uri <- parseURIReference $ unpack target = Link dat "" uri : extractMisc misc
extractMisc (_:misc) = extractMisc misc
extractMisc [] = []

extractEl el@(Element _ _ children) =
    extractElAttr el "{http://www.w3.org/1999/xlink}href" ++
    extractElAttr el "href" ++
    extractElAttr el "longdesc" ++
    extractElAttr el "src" ++
    extractNodes children

extractElAttr (Element _ attrs children) attr
        | Just val <- attr `M.lookup` attrs,
            Just uri <- parseURIReference $ unpack val = [Link label' title' uri]
        | otherwise = []
    where
        label' = nodesText children
        title' = fromMaybe "" $ M.lookup "title" attrs

extractNodes (NodeElement el:nodes) = extractEl el ++ extractNodes nodes
extractNodes (NodeInstruction instruct:nodes) =
    extractMisc [MiscInstruction instruct] ++ extractNodes nodes
extractNodes (_:nodes) = extractNodes nodes
extractNodes [] = []

(+++) = append
nodesText :: [Node] -> Text
nodesText (NodeElement (Element _ attrs children):nodes) = nodesText children +++ nodesText nodes
nodesText (NodeContent text:nodes) = text +++ nodesText nodes
nodesText (_:nodes) = nodesText nodes
nodesText [] = ""