{-# LANGUAGE OverloadedStrings #-} module Links(extractLinks, linkToText, Link(..)) where import Text.XML import qualified Data.Map as M import Network.URI import Data.Text (Text, unpack, append, pack, replace, strip) import Data.Maybe data Link = Link { label :: Text, title :: Text, href :: URI } linkToText :: Link -> Text linkToText (Link label' title' href') = rmWs label' +++ "\t" +++ rmWs title' +++ "\t" +++ pack (show href') rmWs text = strip $ replace "\t" " " $ replace "\n" " " text extractLinks :: Document -> [Link] extractLinks (Document prologue root misc) = extractMisc (prologueBefore prologue) ++ extractMisc (prologueAfter prologue) ++ extractEl root ++ extractMisc misc extractMisc :: [Miscellaneous] -> [Link] extractMisc (MiscInstruction (Instruction target dat):misc) | Just uri <- parseURIReference $ unpack target = Link dat "" uri : extractMisc misc extractMisc (_:misc) = extractMisc misc extractMisc [] = [] extractEl el@(Element _ _ children) = extractElAttr el "{http://www.w3.org/1999/xlink}href" ++ extractElAttr el "href" ++ extractElAttr el "longdesc" ++ extractElAttr el "src" ++ extractNodes children extractElAttr (Element _ attrs children) attr | Just val <- attr `M.lookup` attrs, Just uri <- parseURIReference $ unpack val = [Link label' title' uri] | otherwise = [] where label' = nodesText children title' = fromMaybe "" $ M.lookup "title" attrs extractNodes (NodeElement el:nodes) = extractEl el ++ extractNodes nodes extractNodes (NodeInstruction instruct:nodes) = extractMisc [MiscInstruction instruct] ++ extractNodes nodes extractNodes (_:nodes) = extractNodes nodes extractNodes [] = [] (+++) = append nodesText :: [Node] -> Text nodesText (NodeElement (Element _ attrs children):nodes) = nodesText children +++ nodesText nodes nodesText (NodeContent text:nodes) = text +++ nodesText nodes nodesText (_:nodes) = nodesText nodes nodesText [] = ""