~alcinnz/rhapsode

ref: a3dc72fecc0c1ea5705e723c2bfe06d53d57c695 rhapsode/src/Links.hs -rw-r--r-- 2.7 KiB
a3dc72fe — Adrian Cochrane Refactor Rhapsode to use new HURL APIs and more leniantly handle MIMEtypes. 4 years ago
                                                                                
36f30081 Adrian Cochrane
4777781c Adrian Cochrane
36f30081 Adrian Cochrane
4777781c Adrian Cochrane
36f30081 Adrian Cochrane
4777781c Adrian Cochrane
36f30081 Adrian Cochrane
4777781c Adrian Cochrane
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
{-# LANGUAGE OverloadedStrings #-}
module Links(extractLinks, linkToText, Link(..), c_extractLinks) where

import Text.XML
import qualified Data.Map as M
import Network.URI
import Data.Text (Text, unpack, append, pack, replace, strip)
import qualified Data.Text.Foreign as FTxt
import Data.Maybe

import Types
import Foreign.StablePtr
import Foreign.C.String
import Foreign.Marshal.Array
import Control.Monad (forM)

data Link = Link {
    label :: Text,
    title :: Text,
    href :: URI
}

linkToText :: Link -> Text
linkToText (Link label' title' href') =
    rmWs label' +++ "\t" +++ rmWs title' +++ "\t" +++ pack (show href')

rmWs text = strip $ replace "\t" " " $ replace "\n" " " text

extractLinks :: Document -> [Link]
extractLinks (Document prologue root misc) =
    extractMisc (prologueBefore prologue) ++ extractMisc (prologueAfter prologue) ++
    extractEl root ++ extractMisc misc

extractMisc :: [Miscellaneous] -> [Link]
extractMisc (MiscInstruction (Instruction target dat):misc)
    | Just uri <- parseURIReference $ unpack target = Link dat "" uri : extractMisc misc
extractMisc (_:misc) = extractMisc misc
extractMisc [] = []

extractEl el@(Element _ _ children) =
    extractElAttr el "{http://www.w3.org/1999/xlink}href" ++
    extractElAttr el "href" ++
    extractElAttr el "longdesc" ++
    extractElAttr el "src" ++
    extractNodes children

extractElAttr (Element _ attrs children) attr
        | Just val <- attr `M.lookup` attrs,
            Just uri <- parseURIReference $ unpack val = [Link label' title' uri]
        | otherwise = []
    where
        label' = nodesText children
        title' = fromMaybe "" $ M.lookup "title" attrs

extractNodes (NodeElement el:nodes) = extractEl el ++ extractNodes nodes
extractNodes (NodeInstruction instruct:nodes) =
    extractMisc [MiscInstruction instruct] ++ extractNodes nodes
extractNodes (_:nodes) = extractNodes nodes
extractNodes [] = []

(+++) = append
nodesText :: [Node] -> Text
nodesText (NodeElement (Element _ attrs children):nodes) = nodesText children +++ nodesText nodes
nodesText (NodeContent text:nodes) = text +++ nodesText nodes
nodesText (_:nodes) = nodesText nodes
nodesText [] = ""

-- C API
foreign export ccall c_extractLinks :: StablePtr Page -> IO (CArray CString)

c_extractLinks c_page = do
    page <- deRefStablePtr c_page
    ret <- forM (extractLinks $ html page) $ \link -> do
        c_label <- text2cstring $ strip $ label link
        c_title <- text2cstring $ strip $ title link
        c_href <- newCString $ uriToString id (href link) ""
        return [c_label, c_title, c_href]
    nil <- newCString " "
    newArray0 nil $ concat ret

text2cstring txt = FTxt.withCStringLen txt $ \s -> (peekCStringLen s >>= newCString)