~alcinnz/hurl

12c81bf0660acab500c2c9c234803e0fd1a47a0d — Adrian Cochrane 1 year, 8 months ago 7d31eda
Bug/crash/feature fixes for submitting forms.

Now longer crashes when attempting to save cookies either in-memory or on-disk.
On-disk cookie storage is now resilient to changes in http-client internals.
HURL now handles encoding form submissions as per caller-specified MIMEtype.
HURL now supports a choice of HTTP method when submitting forms.

Bugfix means now content negotiation, caching, & form data are now actually sent to the server.
4 files changed, 258 insertions(+), 22 deletions(-)

A Main2.hs
M hurl.cabal
A src/Network/URI/CookiesDB.hs
M src/Network/URI/Fetch.hs
A Main2.hs => Main2.hs +30 -0
@@ 0,0 1,30 @@
{-# LANGUAGE OverloadedStrings #-}
module Main where

import Network.URI.Fetch
-- Input parsing
import System.Environment (getArgs)
import Network.URI (parseURI, nullURI)
import Data.Maybe (fromJust)
-- Where to save files
import System.Directory (getCurrentDirectory)
import qualified Data.ByteString.Char8 as C8

main :: IO ()
main = do
    url:encoding:args <- getArgs
    let url' = fromJust $ parseURI url
    putStrLn encoding
    session <- newSession
    dir <- getCurrentDirectory

    resp <- submitURL' session ["*/*"] url' "POST" (C8.pack encoding) $map parseArg args
    res <- saveDownload nullURI dir resp
    putStrLn $ show res

parseArg ('-':arg) | (key, '=':value) <- break (== '=') arg = (key, Left value)
    | otherwise = (arg, Left "")
parseArg ('+':arg) | (key, '=':value) <- break (== '=') arg = (key, Right value)
    | otherwise = (arg, Left "")
parseArg arg | (key, '=':value) <- break (== '=') arg = (key, Left value)
    | otherwise = (arg, Left "")

M hurl.cabal => hurl.cabal +17 -2
@@ 114,8 114,8 @@ library
  if flag(http)
    CPP-options:   -DWITH_HTTP_URI
    build-depends: http-client, http-types >= 0.12 && <0.13,
                   http-client-openssl, HsOpenSSL
    other-modules: Network.URI.Cache
                   http-client-openssl, HsOpenSSL, time, cookie
    other-modules: Network.URI.Cache, Network.URI.CookiesDB
  if flag(gemini)
    CPP-options:   -DWITH_GEMINI_URI -DWITH_RAW_CONNECTIONS
    build-depends: HsOpenSSL, openssl-streams >= 1.2 && < 1.3, io-streams >= 1.5 && < 1.6


@@ 152,3 152,18 @@ executable hurl
  default-language:    Haskell2010

  ghc-options: -threaded

executable hurl-post
  -- .hs file containing the Main module
  main-is:              Main2.hs

  -- Other library packages from which modules are imported
  build-depends:        base >= 4.9 && <5, hurl, network-uri, directory, bytestring

  -- Directories containing source files.
  hs-source-dirs:       .

  -- Base languages which the package is written in.
  default-language:     Haskell2010

  ghc-options:          -threaded

A src/Network/URI/CookiesDB.hs => src/Network/URI/CookiesDB.hs +135 -0
@@ 0,0 1,135 @@
-- | Read & write Netscape Navigator cookies format.
{-# LANGUAGE OverloadedStrings, RecordWildCards #-}
module Network.URI.CookiesDB (readCookies, writeCookies) where
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
import           Network.HTTP.Client
import           System.Directory (doesFileExist)

import           Web.Cookie (formatCookieExpires, parseCookieExpires)
import           Data.Maybe (fromMaybe, mapMaybe)
import           Data.Time.Clock (nominalDay, getCurrentTime, addUTCTime, UTCTime)

readCookies :: FilePath -> IO CookieJar
readCookies filepath = do
    exists <- doesFileExist filepath
    if exists then do
        file <- B.readFile filepath
        now <- getCurrentTime
        return $ createCookieJar $ readCookies' now file
    else return $ createCookieJar []
readCookies' :: UTCTime -> B.ByteString -> [Cookie]
readCookies' now = mapMaybe (readCookie' now) . C.lines
readCookie' :: UTCTime -> B.ByteString -> Maybe Cookie
readCookie' now = readCookie now . C.split '\t'
readCookie :: UTCTime -> [B.ByteString] -> Maybe Cookie
readCookie now [domain, _, path, secure, expiration, name, value] =
    Just Cookie {
        cookie_domain = domain,
        cookie_path = path,
        cookie_secure_only = b secure,
        cookie_expiry_time = fromMaybe (addUTCTime nominalDay now) $ parseCookieExpires expiration,
        cookie_name = name,
        cookie_value = value,

        cookie_creation_time = now,
        cookie_last_access_time = now,
        cookie_persistent = True,
        cookie_host_only = False,
        cookie_http_only = False
    }
readCookie now [domain, _, path, secure, expiration, name, value, httpOnly, session] =
    Just Cookie {
        cookie_domain = domain,
        cookie_path = path,
        cookie_secure_only = b secure,
        cookie_expiry_time = fromMaybe (addUTCTime nominalDay now) $ parseCookieExpires expiration,
        cookie_name = name,
        cookie_value = value,
        cookie_http_only = b httpOnly,
        cookie_persistent = not $ b session,

        cookie_creation_time = now,
        cookie_last_access_time = now,
        cookie_host_only = False
    }
readCookie now [domain, _, path, secure, expiration, name, value,
    httpOnly, session, sameSite] = Just Cookie {
        cookie_domain = domain,
        cookie_path = path,
        cookie_secure_only = b secure,
        cookie_expiry_time = fromMaybe (addUTCTime nominalDay now) $ parseCookieExpires expiration,
        cookie_name = name,
        cookie_value = value,
        cookie_http_only = b httpOnly,
        cookie_persistent = not $ b session,
        cookie_host_only = sameSite == "STRICT",

        cookie_creation_time = now,
        cookie_last_access_time = now
    }
readCookie now [domain, _, path, secure, expiration, name, value,
    httpOnly, session, sameSite, _] = Just Cookie {
        cookie_domain = domain,
        cookie_path = path,
        cookie_secure_only = b secure,
        cookie_expiry_time = fromMaybe (addUTCTime nominalDay now) $ parseCookieExpires expiration,
        cookie_name = name,
        cookie_value = value,
        cookie_http_only = b httpOnly,
        cookie_persistent = not $ b session,
        cookie_host_only = sameSite == "STRICT",

        cookie_creation_time = now,
        cookie_last_access_time = now
    }
readCookie now [domain, _, path, secure, expiration, name, value,
    httpOnly, session, sameSite, _, creation] = Just Cookie {
        cookie_domain = domain,
        cookie_path = path,
        cookie_secure_only = b secure,
        cookie_expiry_time = fromMaybe (addUTCTime nominalDay now) $ parseCookieExpires expiration,
        cookie_name = name,
        cookie_value = value,
        cookie_http_only = b httpOnly,
        cookie_persistent = not $ b session,
        cookie_host_only = sameSite == "STRICT",
        cookie_creation_time = fromMaybe now $ parseCookieExpires creation,
        cookie_last_access_time = fromMaybe now $ parseCookieExpires creation
    }
readCookie now (domain:_:path:secure:expiration:name:value:
    httpOnly:session:sameSite:_:creation:access:_) = Just Cookie {
        cookie_domain = domain,
        cookie_path = path,
        cookie_secure_only = b secure,
        cookie_expiry_time = fromMaybe (addUTCTime nominalDay now) $ parseCookieExpires expiration,
        cookie_name = name,
        cookie_value = value,
        cookie_http_only = b httpOnly,
        cookie_persistent = not $ b session,
        cookie_host_only = sameSite == "STRICT",
        cookie_creation_time = fromMaybe now $ parseCookieExpires creation,
        cookie_last_access_time = fromMaybe now $ parseCookieExpires access
    }
readCookie _ _ = Nothing
b "TRUE" = True
b _ = False

writeCookies :: FilePath -> CookieJar -> Bool -> IO ()
writeCookies filepath cookies isSession = do
    B.writeFile filepath $ writeCookies' isSession $ destroyCookieJar cookies
writeCookies' :: Bool -> [Cookie] -> B.ByteString
writeCookies' isSession = C.unlines . map writeCookie' . filter shouldSaveCookie
    where
        shouldSaveCookie | isSession = cookie_persistent
            | otherwise = const True
writeCookie' :: Cookie -> B.ByteString
writeCookie' Cookie {..} = C.intercalate "\t" [
    cookie_domain, "TRUE", cookie_path, b' cookie_secure_only,
    formatCookieExpires cookie_expiry_time, cookie_name, cookie_value,
    b' cookie_http_only, b' $ not cookie_persistent,
    if cookie_host_only then "STRICT" else "LAX", "MEDIUM",
    formatCookieExpires cookie_creation_time,
    formatCookieExpires cookie_last_access_time]
b' True = "TRUE"
b' False = "FALSE"

M src/Network/URI/Fetch.hs => src/Network/URI/Fetch.hs +76 -20
@@ 3,7 3,7 @@
-- | Retrieves documents for a URL, supporting multiple URL schemes that can be
-- disabled at build-time for reduced dependencies.
module Network.URI.Fetch(Session(locale, aboutPages, redirectCount, cachingEnabled), newSession,
    fetchURL, fetchURL', fetchURLs, submitURL, mimeERR, htmlERR,
    fetchURL, fetchURL', fetchURLs, submitURL, submitURL', mimeERR, htmlERR,
    dispatchByMIME, appsForMIME, Application(..), dispatchByApp,
    saveDownload, downloadToURI,
    -- logging API


@@ 13,16 13,20 @@ import Network.URI.Types

import qualified Data.Text as Txt
import           Data.Text (Text)
import qualified Data.Text.Encoding as Txt
import           Network.URI
import qualified Data.ByteString as Strict
import           Data.ByteString.Lazy (ByteString)
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Builder as Builder
import           Network.URI.Charset
import           Control.Exception
import           System.IO.Error (isEOFError)
import           Control.Concurrent.Async (forConcurrently)

import qualified Data.Maybe as M

-- for about: URIs & port parsing, all standard lib
import Data.Maybe (fromMaybe, listToMaybe, isJust)
import Data.Either (isLeft)


@@ 44,6 48,7 @@ import Data.List as L

#ifdef WITH_HTTP_URI
import qualified Network.HTTP.Client as HTTP
import qualified Network.HTTP.Client.MultipartFormData as HTTP
import qualified Network.HTTP.Client.OpenSSL as TLS
import qualified OpenSSL.Session as TLS
import           Network.HTTP.Types


@@ 51,6 56,7 @@ import           Data.List (intercalate)
import           Control.Concurrent (forkIO)

import           Network.URI.Cache
import           Network.URI.CookiesDB
#endif

#ifdef WITH_RAW_CONNECTIONS


@@ 132,12 138,10 @@ newSession' appname = do
    TLS.contextSetVerificationMode httpsCtxt $ TLS.VerifyPeer True True Nothing
    managerHTTP' <- HTTP.newManager $ TLS.opensslManagerSettings $ return httpsCtxt

    cookiesDir <- getXdgDirectory XdgData "nz.geek.adrian.hurl.cookies"
    cookiesDir <- getXdgDirectory XdgData "nz.geek.adrian.hurl.cookies2"
    let cookiesPath' = cookiesDir </> appname
    cookiesExist <- doesFileExist cookiesPath'
    cookies <- if cookiesExist then readMaybe <$> readFile cookiesPath' else return Nothing
    cookies' <- readCookies cookiesPath'
    now <- getCurrentTime
    let cookies' = HTTP.createCookieJar $ fromMaybe [] cookies
    cookieJar <- newMVar $ HTTP.evictExpiredCookies cookies' now
    cookieJar' <- newMVar $ HTTP.createCookieJar []
#endif


@@ 217,24 221,75 @@ mimeERR = "txt/x-error\t"
htmlERR = "html/x-error\t"

submitURL :: Session -> [String] -> URI -> Text -> String -> IO (URI, String, Either Text ByteString)
-- | See submitURL', preserved for backwards compatability.
-- This is a little more cumbersome to use, & doesn't support file uploads.
-- Was designed naively based on convenience of initial caller.
submitURL s a u m q =
    submitURL' s a u (Txt.encodeUtf8 m) "application/x-www-form-urlencoded" $
        Prelude.map parseQuery $ Txt.splitOn "&" $ Txt.pack q
  where
    parseQuery q = let (key, value) = Txt.breakOn "=" q in if Txt.null value
        then (decode key, Left "")
        else (decode key, Left $ decode $ Txt.tail value)
    decode = unEscapeString . Txt.unpack
-- | Uploads given key-value pairs to the specified URL using the specified HTTP method & encoding.
-- The key-value pairs may specify filepaths, in which case the method must be "POST"
-- and the encoding must be "multipart/form-data" for that data to get sent.
--
-- Unsupported encodings (values other than "application/x-www-form-urlencoded",
-- "text/plain", or "multipart/form-data") omits the key-value pairs from the request.
submitURL' :: Session -> [String] -> URI -> Strict.ByteString -> Strict.ByteString ->
    [(String, Either String FilePath)] -> IO (URI, String, Either Text ByteString)
#ifdef WITH_HTTP_URI
submitURL session accept uri "POST" query | uriScheme uri `elem` ["http:", "https:"] = do
addHTTPBody mime body req = return req {
    HTTP.requestHeaders = (hContentType, mime) :
        Prelude.filter (\(x, _) -> x /= hContentType) (HTTP.requestHeaders req),
    HTTP.requestBody = HTTP.RequestBodyBS $ C8.pack body
  }
packQuery :: [(String, Either String FilePath)] -> C8.ByteString -> HTTP.Request -> IO HTTP.Request
packQuery query mime@"application/x-www-form-urlencoded" =
    addHTTPBody mime $ encodeQuery query
packQuery query mime@"text/plain" = addHTTPBody mime $
    Prelude.unlines [value | (key, Left value) <- query, not $ null value]
packQuery q "multipart/form-data" = HTTP.formDataBody $ Prelude.map encodePart q
  where
    encodePart (key, Left value) = HTTP.partBS (Txt.pack key) (C8.pack value)
    encodePart (key, Right value) =
        -- C:\fakepath\ is part of the webstandards now & I might as well use it.
        -- Ancient browsers exposed the full filepath which was a security vulnerability.
        -- Now to avoid breaking servers relying on this behaviour we send
        -- a fake Windows filepath.
        HTTP.partFileRequestBodyM (Txt.pack key) ("C:\\fakepath\\" ++ takeFileName value) $ do
            size <- fromInteger <$> withBinaryFile value ReadMode hFileSize
            body <- B.readFile value
            return $ HTTP.RequestBodyBuilder size $ Builder.lazyByteString body
packQuery _ _ = return -- Do not upload data if requested to do so in an invalid format.
submitURL' session accept uri method encoding query | uriScheme uri `elem` ["http:", "https:"] = do
    -- HURL is very strict on when it allows cookies to be set: Only POST HTTP requests are considered consent.
    -- For the sake of most webframeworks' CSRF protection, cookies from retrieving the form are retroactively set.
    csrfCookies <- case retroactiveCookies session of {
        Just cookies -> Just <$> readMVar cookies;
        Nothing -> return Nothing
    }
    fetchHTTPCached session accept uri (\req -> req {
    fetchHTTPCached session False accept uri (\req -> do
        ret <- packQuery query encoding req
        return ret {
            HTTP.cookieJar = firstJust csrfCookies $ HTTP.cookieJar req,
            HTTP.method = "POST",
            HTTP.requestBody = HTTP.RequestBodyBS $ C8.pack query
        }) $ \resp -> do
            HTTP.method = method
        }) $ \resp -> if method /= "POST" then return () else do
            let cookies = HTTP.responseCookieJar resp
            putMVar (globalCookieJar session) cookies
            writeFile (cookiesPath session) $ show $ HTTP.destroyCookieJar cookies
            swapMVar (globalCookieJar session) cookies
            writeCookies (cookiesPath session) cookies False
#endif
submitURL session mimes uri _method query = fetchURL' session mimes uri { uriQuery = '?':query }
submitURL' session mimes uri _method _encoding query = fetchURL' session mimes uri {
    uriQuery = '?':encodeQuery query }
encodeQuery :: [(String, Either String FilePath)] -> String
encodeQuery query = intercalate "&" $ M.mapMaybe encodePart query
  where
    encodePart (key, Left "") = Just $ escape key
    encodePart (key, Left value) = Just (escape key ++ '=':escape value)
    encodePart _ = Nothing
    escape = escapeURIString isUnescapedInURIComponent

-- | As per `fetchURL`, but also returns the redirected URI.
fetchURL' :: Session -> [String] -> URI -> IO (URI, String, Either Text ByteString)


@@ 289,10 344,11 @@ fetchURL' Session {aboutPages = pages} _ url@URI {uriScheme = "about:", uriPath 

#ifdef WITH_HTTP_URI
fetchURL' session accept uri | uriScheme uri `elem` ["http:", "https:"] =
    fetchHTTPCached session accept uri id saveCookies
    fetchHTTPCached session (cachingEnabled session) accept uri return saveCookies
  where
    saveCookies resp
        | Just cookies <- retroactiveCookies session = putMVar cookies $ HTTP.responseCookieJar resp
        | Just cookies <- retroactiveCookies session =
            void $swapMVar cookies $HTTP.responseCookieJar resp
        | otherwise = return ()
#endif



@@ 395,14 451,14 @@ dispatchByApp _ _ _ _ = return False
#endif

#ifdef WITH_HTTP_URI
fetchHTTPCached session accept@(defaultMIME:_) uri cbReq cbResp = do
    cached <- if cachingEnabled session then readCacheHTTP uri else return (Nothing, Nothing)
fetchHTTPCached session shouldCache accept@(defaultMIME:_) uri cbReq cbResp = do
    cached <- if shouldCache then readCacheHTTP uri else return (Nothing, Nothing)
    response <- case cached of
        (Just (mime, body), Nothing) -> return $ Right (mime, body)
        (cached, cachingHeaders) -> do
            request <- HTTP.requestFromURI uri
            cookieJar <- readMVar $ globalCookieJar session
            let request' = cbReq $ request {
            request'<- cbReq request {
                HTTP.cookieJar = Just $ cookieJar,
                HTTP.requestHeaders = [
                    ("Accept", C8.pack $ intercalate ", " accept),


@@ 410,7 466,7 @@ fetchHTTPCached session accept@(defaultMIME:_) uri cbReq cbResp = do
                ] ++ fromMaybe [] cachingHeaders,
                HTTP.redirectCount = 0
            }
            response <- HTTP.httpLbs request $ managerHTTP session
            response <- HTTP.httpLbs request' $ managerHTTP session
            cbResp response
            case (
                    HTTP.responseStatus response,


@@ 442,7 498,7 @@ fetchHTTPCached session accept@(defaultMIME:_) uri cbReq cbResp = do
            let mime' = L.map (Txt.unpack . Txt.strip) $ Txt.splitOn ";" mime
            in return $ resolveCharset' uri mime' body
  `catch` \e -> do return (uri, mimeERR, Left $ Txt.pack $ trans (locale session) $ Http e)
fetchHTTPCached session [] uri _ _ =
fetchHTTPCached session _ [] uri _ _ =
    return (uri, mimeERR, Left $ Txt.pack $ trans (locale session) $ UnsupportedMIME "")
#endif