~alcinnz/bureaucromancy

47000929c9a4b46e8757d5daf0c23ede3fd30199 — Adrian Cochrane 1 year, 4 months ago 4e46e68
Parse <input> tags.

TODO: Parse other form input elements, <datalist>, & <label>.
2 files changed, 124 insertions(+), 11 deletions(-)

M bureaucromancy.cabal
M src/Text/HTML/Form.hs
M bureaucromancy.cabal => bureaucromancy.cabal +1 -1
@@ 72,7 72,7 @@ library

    -- Other library packages from which modules are imported.
    build-depends:    base ^>=4.16.4.0, ginger,
            bytestring, text, xml-conduit, network-uri
            bytestring, text, xml-conduit, network-uri, regex-tdfa

    -- Directories containing source files.
    hs-source-dirs:   src

M src/Text/HTML/Form.hs => src/Text/HTML/Form.hs +123 -10
@@ 1,14 1,19 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedStrings, TypeSynonymInstances, FlexibleInstances #-}
module Text.HTML.Form (Form(..), Input(..), parseElement, parseDocument) where

import Data.Text (Text)
import qualified Data.Text as Txt
import Text.XML.Cursor
import Text.XML (Document)
import Text.XML (Document, Name)

import Data.Maybe (fromMaybe)
import Data.Maybe (fromMaybe, listToMaybe, mapMaybe)
import Network.URI (parseURIReference, URI, nullURI)
import Data.Char (isDigit)
import Data.List (nub)
import Text.Read (readMaybe)

import Text.Regex.TDFA (Regex, defaultCompOpt, defaultExecOpt)
import Text.Regex.TDFA.Text (compile)

data Form = Form {
    action :: URI,


@@ 18,31 23,133 @@ data Form = Form {
    target :: Text,
    acceptCharset :: [Text],
    autocomplete :: Bool,
    name :: Text,
    formName :: Text,
    rel :: Text,
    inputs :: [Input]
}

data Input = Input
data Input = Input {
    -- Core attributes
    inputType :: Text,
    dirname :: Text,
    inputName :: Text,
    -- State
    value :: Text,
    inputAutocomplete :: Text,
    autofocus :: Bool,
    checked :: Bool,
    disabled :: Bool,
    readonly :: Bool,
    -- Input behaviour
    multiple :: Bool,
    formAction :: Maybe URI,
    formEnctype :: Maybe Text,
    formMethod :: Maybe Text,
    formValidate :: Bool,
    formTarget :: Maybe Text,
    inputMode :: Text,
    list :: [(Text, Text)],
    -- Validation
    range :: (Maybe Text, Maybe Text),
    step :: Maybe Text,
    lengthRange :: (Maybe Int, Maybe Int),
    pattern :: Maybe Regex,
    required :: Bool,
    -- Presentation
    placeholder :: Text,
    -- sort by tabindex?
    title :: Text,
    size :: Maybe Int,
    fileData :: FileSelector,
    imageData :: ImageData
}
data FileSelector = FileSelector {
    fileAccept :: [Text],
    fileCapture :: Text
}
data ImageData = ImageData {
    imgAlt :: Maybe Text,
    imgSize :: (Maybe Int, Maybe Int),
    imgSrc :: Maybe Text
}

attr :: Text -> Cursor -> Text -> Text
attr n el def | [ret] <- n `laxAttribute` el = ret
    | otherwise = def
attr' :: Text -> Cursor -> (Text -> a) -> Text -> a
attr' n el cb def = cb $ attr n el def
attr'' :: Text -> Cursor -> (String -> a) -> Text -> a
attr'' n el cb def = attr' n el (cb . Txt.unpack) def
hasAttr :: Name -> Cursor -> Bool
hasAttr n = not . null . hasAttribute n
mAttr :: Text -> Cursor -> Maybe Text
mAttr n = listToMaybe . laxAttribute n
parseElement :: Cursor -> Maybe Form
parseElement el | _:_ <- laxElement "form" el = Just Form {
        action = attr' "action" el
            (fromMaybe nullURI . parseURIReference . Txt.unpack) ".",
        action = attr'' "action" el (fromMaybe nullURI . parseURIReference) ".",
        enctype = attr "enctype" el "",
        method = attr "method" el "GET",
        validate = null $ hasAttribute "novalidate" el,
        target = attr "target" el "_self",
        acceptCharset = attr' "accept-charset" el Txt.words "utf-8",
        autocomplete = not $ null $ hasAttribute "autocomplete" el,
        name = attr "name" el "",
        autocomplete = hasAttr "autocomplete" el,
        formName = attr "name" el "",
        rel = attr "rel" el "",
        inputs = []
        inputs = mapMaybe parseInput $ queryInputs el
      }
    | otherwise = Nothing

queryInputs :: Cursor -> [Cursor]
queryInputs form = (allInputs >=> inForm) form
  where
    allInputs = last' ancestor >=> descendant >=> laxElement "input"
    inForm = check nestedInForm &++
        check (\x -> laxAttribute "form" x == laxAttribute "id" form)
    nestedInForm x = listToMaybe ((ancestor >=> laxElement "form") x) == Just form
    last' f x | ret:_ <- reverse $ f x = [ret]
        | otherwise = []
    f &++ g = \x -> nub (f x ++ g x)
parseInput :: Cursor -> Maybe Input
parseInput el | _:_ <- laxElement "input" el = Just Input {
        inputType = attr "type" el "text",
        value = attr "value" el "",
        inputAutocomplete = attr "autocomplete" el "on",
        autofocus = hasAttr "autofocus" el,
        checked = hasAttr "checked" el,
        disabled = hasAttr "disabled" el,
        readonly = hasAttr "readonly" el,
        multiple = hasAttr "multiple" el,
        dirname = attr "dirname" el "",
        inputName = attr "name" el "",
        formAction = if hasAttr "formaction" el
            then attr' "formaction" el (parseURIReference . Txt.unpack) ""
            else Nothing,
        formEnctype = mAttr "formenctype" el,
        formMethod = mAttr "formmethod" el,
        formValidate = not $ hasAttr "formnovalidate" el,
        formTarget = mAttr "formtarget" el,
        inputMode = attr "inputmode" el "text",
        list = [], -- TODO
        range = (mAttr "min" el, mAttr "max" el),
        step = mAttr "step" el,
        lengthRange = (attr'' "minlength" el readMaybe "",
            attr'' "maxLength" el readMaybe ""),
        pattern = attr' "pattern" el
            (rightToMaybe . compile defaultCompOpt defaultExecOpt) ".*",
        required = hasAttr "required" el,
        placeholder = attr "placeholder" el "",
        title = attr "title" el "",
        size = attr'' "size" el readMaybe "",
        fileData = FileSelector {
            fileAccept = attr' "accept" el Txt.words "*",
            fileCapture = attr "capture" el ""
        },
        imageData = ImageData {
            imgAlt = mAttr "alt" el,
            imgSize = (attr'' "width" el readMaybe "",
                attr'' "height" el readMaybe ""),
            imgSrc = mAttr "src" el
        }
      }
    | otherwise = Nothing



@@ 56,3 163,9 @@ parseDocument doc n
  where
    forms = orSelf descendant >=> laxElement "form"
    doc' = fromDocument doc

rightToMaybe :: Either a b -> Maybe b
rightToMaybe (Left _)  = Nothing
rightToMaybe (Right x) = Just x
instance Eq Cursor where
    a == b = node a == node b