~alcinnz/amphiarao

ref: b8351f834a84aaae9a523cb90c57a78bbc05e39e amphiarao/src/Internal/Elements.hs -rw-r--r-- 2.1 KiB
b8351f83 — Adrian Cochrane Support searching for links by label, both in JSON & HTML interfaces 3 years ago
                                                                                
38e35ef9 Adrian Cochrane
8e195046 Adrian Cochrane
38e35ef9 Adrian Cochrane
8e195046 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
8e195046 Adrian Cochrane
b8351f83 Adrian Cochrane
8e195046 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
38e35ef9 Adrian Cochrane
b8351f83 Adrian Cochrane
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
{-# LANGUAGE OverloadedStrings, DeriveGeneric #-}
module Internal.Elements (getTitle, Find(..), find) where

import Text.XML
import qualified Data.Map as M
import Data.Text as Txt hiding (find)
import Control.Concurrent.MVar

import Internal

import Data.Aeson
import Data.Text as Txt (Text, pack, append)
import GHC.Generics

-- Selector engines
import qualified Text.XML.Cursor as X
import qualified XML.Selectors.CSS as CSS

import Network.URI (parseURIReference)
import Data.Maybe

getTitle :: Session -> IO Text
getTitle session = getTitle' <$> documentRoot <$> document <$> readMVar session

getTitle' (Element "title" _ childs) = nodesText childs
getTitle' (Element "h1" _ childs) = nodesText childs
getTitle' (Element _ _ childs)
    -- FIXME: Caught Rhapsode bug repaired here, needs that filtering condition.
    | title:_ <- [getTitle' el | NodeElement el <- childs, getTitle' el /= ""] = title
    | otherwise = ""

nodesText (NodeElement (Element _ _ childs):nodes) = nodesText childs `append` nodesText nodes
nodesText (NodeContent txt:nodes) = txt `append` nodesText nodes
nodesText (_:nodes) = nodesText nodes
nodesText [] = ""

---

data Find = Find { using :: Text, value :: String } deriving Generic
instance FromJSON Find
find :: Find -> X.Cursor -> Either (Bool, String) [X.Cursor]
find (Find "css selector" sel) root = case CSS.parsePath sel of
    Right sel' -> Right $ CSS.toAxis sel' root
    Left msg -> Left (True, msg)
find (Find "link text" sel) root = Right $ allLinks (== pack sel) root
find (Find "partial link text" sel) root = Right $ allLinks (Txt.isInfixOf $ pack sel) root
find (Find type_ _) _ = Left (False, "Invalid selector type: " ++ Txt.unpack type_)

allLinks test = X.descendant X.>=>
    -- Missing some misc. elements Rhapsode treats as links
    (X.hasAttribute "src" `union` X.hasAttribute "href") X.>=>
    X.checkElement test'
  where
    test' (Element _ attrs childs) =
        isJust (parseURIReference <$> unpack <$>
            (M.lookup "src" attrs *> M.lookup "href" attrs)) ||
        -- Emulate Rhapsode's mandatory whitespace-collapse
        test (Txt.unwords $ Txt.words $ nodesText childs)

union a b cursor = a cursor ++ b cursor