~alcinnz/amphiarao

ref: a3014db3a647dab65d218e8fbd4c12b48df9a054 amphiarao/src/Internal/Elements.hs -rw-r--r-- 2.5 KiB
a3014db3 — Adrian Cochrane Add support for clicking <datalist> <options>s or <label>s. 3 years ago
                                                                                
38e35ef9 Adrian Cochrane
8e195046 Adrian Cochrane
38e35ef9 Adrian Cochrane
8e195046 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
38e35ef9 Adrian Cochrane
6a7dafe8 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
5eb4b914 Adrian Cochrane
b8351f83 Adrian Cochrane
8e195046 Adrian Cochrane
b8351f83 Adrian Cochrane
8e195046 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
5eb4b914 Adrian Cochrane
e4002093 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
{-# LANGUAGE OverloadedStrings, DeriveGeneric #-}
module Internal.Elements (getTitle, Find(..), find) where

import Text.XML
import qualified Data.Map as M
import Data.Text as Txt hiding (find)
import Control.Concurrent.MVar

import Internal

import Data.Aeson
import Data.Text as Txt (Text, pack, append)
import GHC.Generics

-- Selector engines
import qualified Text.XML.Cursor as X
import qualified XML.Selectors.CSS as CSS
import qualified Internal.Elements.XPath as XPath

import Network.URI (parseURIReference)
import Data.Maybe
import Data.String (fromString)

getTitle :: Session -> IO Text
getTitle session = getTitle' <$> documentRoot <$> document <$> readMVar session

getTitle' (Element "title" _ childs) = nodesText childs
getTitle' (Element "h1" _ childs) = nodesText childs
getTitle' (Element _ _ childs)
    -- FIXME: Caught Rhapsode bug repaired here, needs that filtering condition.
    | title:_ <- [getTitle' el | NodeElement el <- childs, getTitle' el /= ""] = title
    | otherwise = ""

nodesText (NodeElement (Element _ _ childs):nodes) = nodesText childs `append` nodesText nodes
nodesText (NodeContent txt:nodes) = txt `append` nodesText nodes
nodesText (_:nodes) = nodesText nodes
nodesText [] = ""

---

data Find = Find { using :: Text, value :: String } deriving Generic
instance FromJSON Find
find :: Find -> X.Cursor -> Either (Bool, String) [X.Cursor]
find (Find "css selector" sel) root = case CSS.parsePath sel of
    Right sel' -> Right $ CSS.toAxis sel' root
    Left msg -> Left (True, msg)
find (Find "link text" sel) root = Right $ allLinks (== pack sel) root
find (Find "partial link text" sel) root = Right $ allLinks (Txt.isInfixOf $ pack sel) root
find (Find "tag name" sel) root = Right (X.descendant root >>= X.checkName (== fromString sel))
find (Find "xpath" sel) root = case XPath.runXPath root $ Txt.pack sel of
    Right res -> Right res
    Left msgs -> Left (True, Prelude.unlines msgs)
find (Find type_ _) _ = Left (False, "Invalid selector type: " ++ Txt.unpack type_)

allLinks test = X.descendant X.>=>
    -- Missing some misc. elements Rhapsode treats as links
    (X.hasAttribute "src" `union` X.hasAttribute "href") X.>=>
    X.checkElement test'
  where
    test' (Element _ attrs childs) =
        isJust (parseURIReference <$> unpack <$>
            (M.lookup "src" attrs *> M.lookup "href" attrs)) ||
        -- Emulate Rhapsode's mandatory whitespace-collapse
        test (Txt.unwords $ Txt.words $ nodesText childs)

union a b cursor = a cursor ++ b cursor