{-# LANGUAGE OverloadedStrings, DeriveGeneric #-} module Internal.Elements (getTitle, Find(..), find) where import Text.XML import Text.XML.HXT.DOM.TypeDefs import Data.Tree.NTree.TypeDefs (NTree(..)) import qualified Data.Map as M import Data.Text as Txt hiding (find) import Control.Concurrent.MVar import Internal import Data.Aeson import Data.Text as Txt (Text, pack, append) import GHC.Generics -- Selector engines import qualified Text.XML.Cursor as X import qualified XML.Selectors.CSS as CSS import Text.XML.HXT.XPath.XPathEval (getXPath', parseXPathExpr) import Network.URI (parseURIReference) import Data.Maybe import Data.String (fromString) getTitle :: Session -> IO Text getTitle session = getTitle' <$> documentRoot <$> document <$> readMVar session getTitle' (Element "title" _ childs) = nodesText childs getTitle' (Element "h1" _ childs) = nodesText childs getTitle' (Element _ _ childs) -- FIXME: Caught Rhapsode bug repaired here, needs that filtering condition. | title:_ <- [getTitle' el | NodeElement el <- childs, getTitle' el /= ""] = title | otherwise = "" nodesText (NodeElement (Element _ _ childs):nodes) = nodesText childs `append` nodesText nodes nodesText (NodeContent txt:nodes) = txt `append` nodesText nodes nodesText (_:nodes) = nodesText nodes nodesText [] = "" --- data Find = Find { using :: Text, value :: String } deriving Generic instance FromJSON Find find :: Find -> X.Cursor -> Either (Bool, String) [X.Cursor] find (Find "css selector" sel) root = case CSS.parsePath sel of Right sel' -> Right $ CSS.toAxis sel' root Left msg -> Left (True, msg) find (Find "link text" sel) root = Right $ allLinks (== pack sel) root find (Find "partial link text" sel) root = Right $ allLinks (Txt.isInfixOf $ pack sel) root find (Find "tag name" sel) root = Right (X.descendant root >>= X.checkName (== fromString sel)) -- XML Conduit doesn't have an XPath implementation, but HXT does. So convert back & forth! find (Find "xpath" sel) root = case parseXPathExpr sel of Right expr -> Right $ hxts2cursors root $ getXPath' expr $ conduit2hxt $ X.node root Left err -> Left (True, err) find (Find type_ _) _ = Left (False, "Invalid selector type: " ++ Txt.unpack type_) allLinks test = X.descendant X.>=> -- Missing some misc. elements Rhapsode treats as links (X.hasAttribute "src" `union` X.hasAttribute "href") X.>=> X.checkElement test' where test' (Element _ attrs childs) = isJust (parseURIReference <$> unpack <$> (M.lookup "src" attrs *> M.lookup "href" attrs)) || -- Emulate Rhapsode's mandatory whitespace-collapse test (Txt.unwords $ Txt.words $ nodesText childs) union a b cursor = a cursor ++ b cursor --- XPath Support map' = Prelude.map conduit2hxt (NodeElement (Element name attrs childs)) = NTree (XTag (n2qn name) $ map' attr2hxt $ M.toList attrs) $ map' conduit2hxt childs conduit2hxt (NodeInstruction (Instruction target dat)) = NTree (XPi (mkName $ unpack target) []) [NTree (XText $ unpack dat) []] conduit2hxt (NodeContent txt) = NTree (XText $ unpack txt) [] conduit2hxt (NodeComment txt) = NTree (XCmt $ unpack txt) [] attr2hxt (name, value) = NTree (XAttr $ n2qn name) [NTree (XText $ unpack value) []] n2qn (Name local (Just namespace) (Just prefix)) = mkQName (unpack prefix) (unpack local) (unpack namespace) n2qn (Name local Nothing (Just prefix)) = mkName (unpack prefix ++ ':' : unpack local) n2qn (Name local (Just namespace) Nothing) = mkNsName (unpack local) (unpack namespace) n2qn (Name local Nothing Nothing) = mkName $ unpack local hxt2cursor node hxt = lookup hxt [ (conduit2hxt $ X.node cursor, cursor) | cursor <- X.orSelf X.descendant node] hxts2cursors node hxts = catMaybes $ map' (hxt2cursor node) hxts