~alcinnz/hurl

ref: a0a3f383b82542bad1a519093e3670fdb3056ba4 hurl/src/Network/URI/Charset.hs -rw-r--r-- 2.5 KiB
a0a3f383 — Adrian Cochrane Don't crash on missing /usr/share/app-info/icons/ 4 years ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
{-# LANGUAGE OverloadedStrings #-}

-- | Handles server-specified text decoding.
module Network.URI.Charset(resolveCharset, resolveCharset', convertCharset, charsets) where
import           Data.Text (Text)
import           Data.ByteString.Lazy (ByteString)
import qualified Data.ByteString.Lazy as B
import           Data.Text.Encoding
import           Debug.Trace (trace)

-- | If the MIMEtype specifies a charset parameter, apply it.
resolveCharset :: [String] -- ^ The MIMEtype, split by ';'
    -> ByteString -- ^ The bytes received from the server
    -> (String, Either Text ByteString) -- ^ The MIMEtype (minus parameters) & possibly decoded text, to be returned from protocol handlers.
resolveCharset (mime:('c':'h':'a':'r':'s':'e':'t':'=':charset):_) response =
    (mime, Left $ convertCharset charset $ B.toStrict response)
resolveCharset (mime:_:params) response = resolveCharset (mime:params) response
resolveCharset [mime] response = (mime, Right $ response)
-- NOTE I can't localize this error string because resolveCharset doesn't know the locale.
--      I don't think this is worth fixing, because hitting this indicates the server is badly misbehaving.
resolveCharset [] response = ("text/x-error\t", Left "Filetype unspecified")

-- | As per `resolveCharset`, but also returns given URI (or other type).
resolveCharset' :: a -> [String] -> ByteString -> (a, String, Either Text ByteString)
resolveCharset' a mimes resp = let (mime, resp') = resolveCharset mimes resp in (a, mime, resp')

-- | Decodes bytes according to a charset identified by it's IANA-assigned name(s).
convertCharset "iso-8859-1" = decodeLatin1
convertCharset "latin1" = decodeLatin1
convertCharset "us-ascii" = decodeUtf8With replaceChar
convertCharset "utf-8" = decodeUtf8With replaceChar
convertCharset "utf-16be" = decodeUtf16BEWith replaceChar
convertCharset "utf-16le" = decodeUtf16LEWith replaceChar
convertCharset "utf-16" = decodeUtf16LEWith replaceChar
convertCharset "utf-32be" = decodeUtf32BEWith replaceChar
convertCharset "utf-32le" = decodeUtf32LEWith replaceChar
convertCharset "utf-32" = decodeUtf32LEWith replaceChar
convertCharset charset = -- FIXME Is this the best fallback for unsupported charsets?
    trace ("Unsupported text encoding" ++ charset) $ decodeUtf8With replaceChar

replaceChar _ _ = Just '�'

-- | Lists all charsets supported by convertCharset
charsets :: [Text]
charsets = ["iso-8859-1", "latin1", "us-ascii", "utf-8", "utf-16be", "utf-16le", "utf-16", "utf-32be", "utf-32le", "utf-32"]