{-# LANGUAGE OverloadedStrings, DeriveGeneric #-} module Internal.Elements (getTitle, Find(..), find) where import Text.XML import qualified Data.Map as M import Data.Text as Txt hiding (find) import Control.Concurrent.MVar import Internal import Data.Aeson import Data.Text as Txt (Text, pack, append) import GHC.Generics -- Selector engines import qualified Text.XML.Cursor as X import qualified XML.Selectors.CSS as CSS import qualified Internal.Elements.XPath as XPath import Network.URI (parseURIReference) import Data.Maybe import Data.String (fromString) getTitle :: Session -> IO Text getTitle session = getTitle' <$> documentRoot <$> document <$> readMVar session getTitle' (Element "title" _ childs) = nodesText childs getTitle' (Element "h1" _ childs) = nodesText childs getTitle' (Element _ _ childs) -- FIXME: Caught Rhapsode bug repaired here, needs that filtering condition. | title:_ <- [getTitle' el | NodeElement el <- childs, getTitle' el /= ""] = title | otherwise = "" nodesText (NodeElement (Element _ _ childs):nodes) = nodesText childs `append` nodesText nodes nodesText (NodeContent txt:nodes) = txt `append` nodesText nodes nodesText (_:nodes) = nodesText nodes nodesText [] = "" --- data Find = Find { using :: Text, value :: String } deriving Generic instance FromJSON Find find :: Find -> X.Cursor -> Either (Bool, String) [X.Cursor] find (Find "css selector" sel) root = case CSS.parsePath sel of Right sel' -> Right $ CSS.toAxis sel' root Left msg -> Left (True, msg) find (Find "link text" sel) root = Right $ allLinks (== pack sel) root find (Find "partial link text" sel) root = Right $ allLinks (Txt.isInfixOf $ pack sel) root find (Find "tag name" sel) root = Right (X.descendant root >>= X.checkName (== fromString sel)) find (Find "xpath" sel) root = case XPath.runXPath root $ Txt.pack sel of Right res -> Right res Left msgs -> Left (True, Prelude.unlines msgs) find (Find type_ _) _ = Left (False, "Invalid selector type: " ++ Txt.unpack type_) allLinks test = X.descendant X.>=> -- Missing some misc. elements Rhapsode treats as links (X.hasAttribute "src" `union` X.hasAttribute "href") X.>=> X.checkElement test' where test' (Element _ attrs childs) = isJust (parseURIReference <$> unpack <$> (M.lookup "src" attrs *> M.lookup "href" attrs)) || -- Emulate Rhapsode's mandatory whitespace-collapse test (Txt.unwords $ Txt.words $ nodesText childs) union a b cursor = a cursor ++ b cursor