~alcinnz/amphiarao

ref: 58648b6294944eec4be73156d0f4c4cc99f593d8 amphiarao/src/Internal/Elements.hs -rw-r--r-- 2.5 KiB
58648b62 — Adrian Cochrane Allow searching for a descendant of a given element. 3 years ago
                                                                                
38e35ef9 Adrian Cochrane
8e195046 Adrian Cochrane
38e35ef9 Adrian Cochrane
8e195046 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
38e35ef9 Adrian Cochrane
6a7dafe8 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
5eb4b914 Adrian Cochrane
b8351f83 Adrian Cochrane
8e195046 Adrian Cochrane
b8351f83 Adrian Cochrane
8e195046 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
5eb4b914 Adrian Cochrane
e4002093 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
{-# LANGUAGE OverloadedStrings, DeriveGeneric #-}
module Internal.Elements (getTitle, Find(..), find) where

import Text.XML
import qualified Data.Map as M
import Data.Text as Txt hiding (find)
import Control.Concurrent.MVar

import Internal

import Data.Aeson
import Data.Text as Txt (Text, pack, append)
import GHC.Generics

-- Selector engines
import qualified Text.XML.Cursor as X
import qualified XML.Selectors.CSS as CSS
import qualified Internal.Elements.XPath as XPath

import Network.URI (parseURIReference)
import Data.Maybe
import Data.String (fromString)

getTitle :: Session -> IO Text
getTitle session = getTitle' <$> documentRoot <$> document <$> readMVar session

getTitle' (Element "title" _ childs) = nodesText childs
getTitle' (Element "h1" _ childs) = nodesText childs
getTitle' (Element _ _ childs)
    -- FIXME: Caught Rhapsode bug repaired here, needs that filtering condition.
    | title:_ <- [getTitle' el | NodeElement el <- childs, getTitle' el /= ""] = title
    | otherwise = ""

nodesText (NodeElement (Element _ _ childs):nodes) = nodesText childs `append` nodesText nodes
nodesText (NodeContent txt:nodes) = txt `append` nodesText nodes
nodesText (_:nodes) = nodesText nodes
nodesText [] = ""

---

data Find = Find { using :: Text, value :: String } deriving Generic
instance FromJSON Find
find :: Find -> X.Cursor -> Either (Bool, String) [X.Cursor]
find (Find "css selector" sel) root = case CSS.parsePath sel of
    Right sel' -> Right $ CSS.toAxis sel' root
    Left msg -> Left (True, msg)
find (Find "link text" sel) root = Right $ allLinks (== pack sel) root
find (Find "partial link text" sel) root = Right $ allLinks (Txt.isInfixOf $ pack sel) root
find (Find "tag name" sel) root = Right (X.descendant root >>= X.checkName (== fromString sel))
find (Find "xpath" sel) root = case XPath.runXPath root $ Txt.pack sel of
    Right res -> Right res
    Left msgs -> Left (True, Prelude.unlines msgs)
find (Find type_ _) _ = Left (False, "Invalid selector type: " ++ Txt.unpack type_)

allLinks test = X.descendant X.>=>
    -- Missing some misc. elements Rhapsode treats as links
    (X.hasAttribute "src" `union` X.hasAttribute "href") X.>=>
    X.checkElement test'
  where
    test' (Element _ attrs childs) =
        isJust (parseURIReference <$> unpack <$>
            (M.lookup "src" attrs *> M.lookup "href" attrs)) ||
        -- Emulate Rhapsode's mandatory whitespace-collapse
        test (Txt.unwords $ Txt.words $ nodesText childs)

union a b cursor = a cursor ++ b cursor