1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
{-# LANGUAGE OverloadedStrings, DeriveGeneric #-}
module Internal.Elements (getTitle, Find(..), find) where
import Text.XML
import qualified Data.Map as M
import Data.Text as Txt hiding (find)
import Control.Concurrent.MVar
import Internal
import Data.Aeson
import Data.Text as Txt (Text, pack, append)
import GHC.Generics
-- Selector engines
import qualified Text.XML.Cursor as X
import qualified XML.Selectors.CSS as CSS
import Network.URI (parseURIReference)
import Data.Maybe
import Data.String (fromString)
getTitle :: Session -> IO Text
getTitle session = getTitle' <$> documentRoot <$> document <$> readMVar session
getTitle' (Element "title" _ childs) = nodesText childs
getTitle' (Element "h1" _ childs) = nodesText childs
getTitle' (Element _ _ childs)
-- FIXME: Caught Rhapsode bug repaired here, needs that filtering condition.
| title:_ <- [getTitle' el | NodeElement el <- childs, getTitle' el /= ""] = title
| otherwise = ""
nodesText (NodeElement (Element _ _ childs):nodes) = nodesText childs `append` nodesText nodes
nodesText (NodeContent txt:nodes) = txt `append` nodesText nodes
nodesText (_:nodes) = nodesText nodes
nodesText [] = ""
---
data Find = Find { using :: Text, value :: String } deriving Generic
instance FromJSON Find
find :: Find -> X.Cursor -> Either (Bool, String) [X.Cursor]
find (Find "css selector" sel) root = case CSS.parsePath sel of
Right sel' -> Right $ CSS.toAxis sel' root
Left msg -> Left (True, msg)
find (Find "link text" sel) root = Right $ allLinks (== pack sel) root
find (Find "partial link text" sel) root = Right $ allLinks (Txt.isInfixOf $ pack sel) root
find (Find "tag name" sel) root = Right (X.descendant root >>= X.checkName (== fromString sel))
find (Find type_ _) _ = Left (False, "Invalid selector type: " ++ Txt.unpack type_)
allLinks test = X.descendant X.>=>
-- Missing some misc. elements Rhapsode treats as links
(X.hasAttribute "src" `union` X.hasAttribute "href") X.>=>
X.checkElement test'
where
test' (Element _ attrs childs) =
isJust (parseURIReference <$> unpack <$>
(M.lookup "src" attrs *> M.lookup "href" attrs)) ||
-- Emulate Rhapsode's mandatory whitespace-collapse
test (Txt.unwords $ Txt.words $ nodesText childs)
union a b cursor = a cursor ++ b cursor