{-# LANGUAGE OverloadedStrings #-}
module Links(extractLinks, linkToText, Link(..), c_extractLinks) where
import Text.XML
import qualified Data.Map as M
import Network.URI
import Data.Text (Text, unpack, append, pack, replace, strip)
import qualified Data.Text.Foreign as FTxt
import Data.Maybe
import Types
import Foreign.StablePtr
import Foreign.C.String
import Foreign.Marshal.Array
import Control.Monad (forM)
data Link = Link {
label :: Text,
title :: Text,
href :: URI
}
linkToText :: Link -> Text
linkToText (Link label' title' href') =
rmWs label' +++ "\t" +++ rmWs title' +++ "\t" +++ pack (show href')
rmWs text = strip $ replace "\t" " " $ replace "\n" " " text
extractLinks :: Document -> [Link]
extractLinks (Document prologue root misc) =
extractMisc (prologueBefore prologue) ++ extractMisc (prologueAfter prologue) ++
extractEl root ++ extractMisc misc
extractMisc :: [Miscellaneous] -> [Link]
extractMisc (MiscInstruction (Instruction target dat):misc)
| Just uri <- parseURIReference $ unpack target = Link dat "" uri : extractMisc misc
extractMisc (_:misc) = extractMisc misc
extractMisc [] = []
extractEl el@(Element _ _ children) =
extractElAttr el "{http://www.w3.org/1999/xlink}href" ++
extractElAttr el "href" ++
extractElAttr el "longdesc" ++
extractElAttr el "src" ++
extractNodes children
extractElAttr (Element _ attrs children) attr
| Just val <- attr `M.lookup` attrs,
Just uri <- parseURIReference $ unpack val = [Link label' title' uri]
| otherwise = []
where
label' = nodesText children
title' = fromMaybe "" $ M.lookup "title" attrs
extractNodes (NodeElement el:nodes) = extractEl el ++ extractNodes nodes
extractNodes (NodeInstruction instruct:nodes) =
extractMisc [MiscInstruction instruct] ++ extractNodes nodes
extractNodes (_:nodes) = extractNodes nodes
extractNodes [] = []
(+++) = append
nodesText :: [Node] -> Text
nodesText (NodeElement (Element _ attrs children):nodes) = nodesText children +++ nodesText nodes
nodesText (NodeContent text:nodes) = text +++ nodesText nodes
nodesText (_:nodes) = nodesText nodes
nodesText [] = ""
-- C API
foreign export ccall c_extractLinks :: StablePtr Page -> IO (CArray CString)
c_extractLinks c_page = do
page <- deRefStablePtr c_page
ret <- forM (extractLinks $ html page) $ \link -> do
c_label <- text2cstring $ strip $ label link
c_title <- text2cstring $ strip $ title link
c_href <- newCString $ uriToString id (href link) ""
return [c_label, c_title, c_href]
nil <- newCString " "
newArray0 nil $ concat ret
text2cstring txt = FTxt.withCStringLen txt $ \s -> (peekCStringLen s >>= newCString)