~alcinnz/amphiarao

ref: 2e0a2343f591f1e45ba55eae2b3a06d9648c13ff amphiarao/src/Internal/Elements.hs -rw-r--r-- 3.7 KiB
2e0a2343 — Adrian Cochrane Integrate HXT for working XPath support. 2 years ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
{-# LANGUAGE OverloadedStrings, DeriveGeneric #-}
module Internal.Elements (getTitle, Find(..), find) where

import Text.XML
import Text.XML.HXT.DOM.TypeDefs
import Data.Tree.NTree.TypeDefs (NTree(..))
import qualified Data.Map as M
import Data.Text as Txt hiding (find)
import Control.Concurrent.MVar

import Internal

import Data.Aeson
import Data.Text as Txt (Text, pack, append)
import GHC.Generics

-- Selector engines
import qualified Text.XML.Cursor as X
import qualified XML.Selectors.CSS as CSS
import Text.XML.HXT.XPath.XPathEval (getXPath', parseXPathExpr)

import Network.URI (parseURIReference)
import Data.Maybe
import Data.String (fromString)

getTitle :: Session -> IO Text
getTitle session = getTitle' <$> documentRoot <$> document <$> readMVar session

getTitle' (Element "title" _ childs) = nodesText childs
getTitle' (Element "h1" _ childs) = nodesText childs
getTitle' (Element _ _ childs)
    -- FIXME: Caught Rhapsode bug repaired here, needs that filtering condition.
    | title:_ <- [getTitle' el | NodeElement el <- childs, getTitle' el /= ""] = title
    | otherwise = ""

nodesText (NodeElement (Element _ _ childs):nodes) = nodesText childs `append` nodesText nodes
nodesText (NodeContent txt:nodes) = txt `append` nodesText nodes
nodesText (_:nodes) = nodesText nodes
nodesText [] = ""

---

data Find = Find { using :: Text, value :: String } deriving Generic
instance FromJSON Find
find :: Find -> X.Cursor -> Either (Bool, String) [X.Cursor]
find (Find "css selector" sel) root = case CSS.parsePath sel of
    Right sel' -> Right $ CSS.toAxis sel' root
    Left msg -> Left (True, msg)
find (Find "link text" sel) root = Right $ allLinks (== pack sel) root
find (Find "partial link text" sel) root = Right $ allLinks (Txt.isInfixOf $ pack sel) root
find (Find "tag name" sel) root = Right (X.descendant root >>= X.checkName (== fromString sel))
-- XML Conduit doesn't have an XPath implementation, but HXT does. So convert back & forth!
find (Find "xpath" sel) root = case parseXPathExpr sel of
    Right expr -> Right $ hxts2cursors root $ getXPath' expr $ conduit2hxt $ X.node root
    Left err -> Left (True, err)
find (Find type_ _) _ = Left (False, "Invalid selector type: " ++ Txt.unpack type_)

allLinks test = X.descendant X.>=>
    -- Missing some misc. elements Rhapsode treats as links
    (X.hasAttribute "src" `union` X.hasAttribute "href") X.>=>
    X.checkElement test'
  where
    test' (Element _ attrs childs) =
        isJust (parseURIReference <$> unpack <$>
            (M.lookup "src" attrs *> M.lookup "href" attrs)) ||
        -- Emulate Rhapsode's mandatory whitespace-collapse
        test (Txt.unwords $ Txt.words $ nodesText childs)

union a b cursor = a cursor ++ b cursor

--- XPath Support

map' = Prelude.map

conduit2hxt (NodeElement (Element name attrs childs)) =
    NTree (XTag (n2qn name) $ map' attr2hxt $ M.toList attrs) $ map' conduit2hxt childs
conduit2hxt (NodeInstruction (Instruction target dat)) =
    NTree (XPi (mkName $ unpack target) []) [NTree (XText $ unpack dat) []]
conduit2hxt (NodeContent txt) = NTree (XText $ unpack txt) []
conduit2hxt (NodeComment txt) = NTree (XCmt $ unpack txt) []

attr2hxt (name, value) = NTree (XAttr $ n2qn name) [NTree (XText $ unpack value) []]

n2qn (Name local (Just namespace) (Just prefix)) =
    mkQName (unpack prefix) (unpack local) (unpack namespace)
n2qn (Name local Nothing (Just prefix)) =
    mkName (unpack prefix ++ ':' : unpack local)
n2qn (Name local (Just namespace) Nothing) = mkNsName (unpack local) (unpack namespace)
n2qn (Name local Nothing Nothing) = mkName $ unpack local

hxt2cursor node hxt = lookup hxt [
    (conduit2hxt $ X.node cursor, cursor) | cursor <- X.orSelf X.descendant node]
hxts2cursors node hxts = catMaybes $ map' (hxt2cursor node) hxts