1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
{-# LANGUAGE OverloadedStrings, DeriveGeneric #-}
module Internal.Elements (getTitle, Find(..), find) where
import Text.XML
import Text.XML.HXT.DOM.TypeDefs
import Data.Tree.NTree.TypeDefs (NTree(..))
import qualified Data.Map as M
import Data.Text as Txt hiding (find)
import Control.Concurrent.MVar
import Internal
import Data.Aeson
import Data.Text as Txt (Text, pack, append)
import GHC.Generics
-- Selector engines
import qualified Text.XML.Cursor as X
import qualified XML.Selectors.CSS as CSS
import Text.XML.HXT.XPath.XPathEval (getXPath', parseXPathExpr)
import Network.URI (parseURIReference)
import Data.Maybe
import Data.String (fromString)
getTitle :: Session -> IO Text
getTitle session = getTitle' <$> documentRoot <$> document <$> readMVar session
getTitle' (Element "title" _ childs) = nodesText childs
getTitle' (Element "h1" _ childs) = nodesText childs
getTitle' (Element _ _ childs)
-- FIXME: Caught Rhapsode bug repaired here, needs that filtering condition.
| title:_ <- [getTitle' el | NodeElement el <- childs, getTitle' el /= ""] = title
| otherwise = ""
nodesText (NodeElement (Element _ _ childs):nodes) = nodesText childs `append` nodesText nodes
nodesText (NodeContent txt:nodes) = txt `append` nodesText nodes
nodesText (_:nodes) = nodesText nodes
nodesText [] = ""
---
data Find = Find { using :: Text, value :: String } deriving Generic
instance FromJSON Find
find :: Find -> X.Cursor -> Either (Bool, String) [X.Cursor]
find (Find "css selector" sel) root = case CSS.parsePath sel of
Right sel' -> Right $ CSS.toAxis sel' root
Left msg -> Left (True, msg)
find (Find "link text" sel) root = Right $ allLinks (== pack sel) root
find (Find "partial link text" sel) root = Right $ allLinks (Txt.isInfixOf $ pack sel) root
find (Find "tag name" sel) root = Right (X.descendant root >>= X.checkName (== fromString sel))
-- XML Conduit doesn't have an XPath implementation, but HXT does. So convert back & forth!
find (Find "xpath" sel) root = case parseXPathExpr sel of
Right expr -> Right $ hxts2cursors root $ getXPath' expr $ conduit2hxt $ X.node root
Left err -> Left (True, err)
find (Find type_ _) _ = Left (False, "Invalid selector type: " ++ Txt.unpack type_)
allLinks test = X.descendant X.>=>
-- Missing some misc. elements Rhapsode treats as links
(X.hasAttribute "src" `union` X.hasAttribute "href") X.>=>
X.checkElement test'
where
test' (Element _ attrs childs) =
isJust (parseURIReference <$> unpack <$>
(M.lookup "src" attrs *> M.lookup "href" attrs)) ||
-- Emulate Rhapsode's mandatory whitespace-collapse
test (Txt.unwords $ Txt.words $ nodesText childs)
union a b cursor = a cursor ++ b cursor
--- XPath Support
map' = Prelude.map
conduit2hxt (NodeElement (Element name attrs childs)) =
NTree (XTag (n2qn name) $ map' attr2hxt $ M.toList attrs) $ map' conduit2hxt childs
conduit2hxt (NodeInstruction (Instruction target dat)) =
NTree (XPi (mkName $ unpack target) []) [NTree (XText $ unpack dat) []]
conduit2hxt (NodeContent txt) = NTree (XText $ unpack txt) []
conduit2hxt (NodeComment txt) = NTree (XCmt $ unpack txt) []
attr2hxt (name, value) = NTree (XAttr $ n2qn name) [NTree (XText $ unpack value) []]
n2qn (Name local (Just namespace) (Just prefix)) =
mkQName (unpack prefix) (unpack local) (unpack namespace)
n2qn (Name local Nothing (Just prefix)) =
mkName (unpack prefix ++ ':' : unpack local)
n2qn (Name local (Just namespace) Nothing) = mkNsName (unpack local) (unpack namespace)
n2qn (Name local Nothing Nothing) = mkName $ unpack local
hxt2cursor node hxt = lookup hxt [
(conduit2hxt $ X.node cursor, cursor) | cursor <- X.orSelf X.descendant node]
hxts2cursors node hxts = catMaybes $ map' (hxt2cursor node) hxts