{-# LANGUAGE OverloadedStrings #-} module Links(extractLinks, linkToText, Link(..), c_extractLinks) where import Text.XML import qualified Data.Map as M import Network.URI import Data.Text (Text, unpack, append, pack, replace, strip) import qualified Data.Text.Foreign as FTxt import Data.Maybe import Types import Foreign.StablePtr import Foreign.C.String import Foreign.Marshal.Array import Control.Monad (forM) data Link = Link { label :: Text, title :: Text, href :: URI } linkToText :: Link -> Text linkToText (Link label' title' href') = rmWs label' +++ "\t" +++ rmWs title' +++ "\t" +++ pack (show href') rmWs text = strip $ replace "\t" " " $ replace "\n" " " text extractLinks :: Document -> [Link] extractLinks (Document prologue root misc) = extractMisc (prologueBefore prologue) ++ extractMisc (prologueAfter prologue) ++ extractEl root ++ extractMisc misc extractMisc :: [Miscellaneous] -> [Link] extractMisc (MiscInstruction (Instruction target dat):misc) | Just uri <- parseURIReference $ unpack target = Link dat "" uri : extractMisc misc extractMisc (_:misc) = extractMisc misc extractMisc [] = [] extractEl el@(Element _ _ children) = extractElAttr el "{http://www.w3.org/1999/xlink}href" ++ extractElAttr el "href" ++ extractElAttr el "longdesc" ++ extractElAttr el "src" ++ extractNodes children extractElAttr (Element _ attrs children) attr | Just val <- attr `M.lookup` attrs, Just uri <- parseURIReference $ unpack val = [Link label' title' uri] | otherwise = [] where label' = nodesText children title' = fromMaybe "" $ M.lookup "title" attrs extractNodes (NodeElement el:nodes) = extractEl el ++ extractNodes nodes extractNodes (NodeInstruction instruct:nodes) = extractMisc [MiscInstruction instruct] ++ extractNodes nodes extractNodes (_:nodes) = extractNodes nodes extractNodes [] = [] (+++) = append nodesText :: [Node] -> Text nodesText (NodeElement (Element _ attrs children):nodes) = nodesText children +++ nodesText nodes nodesText (NodeContent text:nodes) = text +++ nodesText nodes nodesText (_:nodes) = nodesText nodes nodesText [] = "" -- C API foreign export ccall c_extractLinks :: StablePtr Page -> IO (CArray CString) c_extractLinks c_page = do page <- deRefStablePtr c_page ret <- forM (extractLinks $ html page) $ \link -> do c_label <- text2cstring $ strip $ label link c_title <- text2cstring $ strip $ title link c_href <- newCString $ uriToString id (href link) "" return [c_label, c_title, c_href] nil <- newCString " " newArray0 nil $ concat ret text2cstring txt = FTxt.withCStringLen txt $ \s -> (peekCStringLen s >>= newCString)