~jaro/balkon

7364410dd4b2c07e049b86f99732262160d1db70 — Jaro 1 year, 9 months ago 2613eb4
Properly use ICU locale identifiers.
M CHANGELOG.md => CHANGELOG.md +5 -0
@@ 1,5 1,10 @@
# Revision history for Balkón

## 0.1.0.1 -- TBD

* Internally, language tags will be cut at the first invalid character before
  being passed to ICU.

## 0.1.0.0 -- 2023-03-13

* Text shaping using HarfBuzz.

M src/Data/Text/ParagraphLayout/Internal/Break.hs => src/Data/Text/ParagraphLayout/Internal/Break.hs +39 -2
@@ 5,12 5,49 @@
-- items (also called UTF-8 code units or bytes) between the start of the input
-- `Text` and the position of the break. The internal offset of the `Text` from
-- the start of its underlying byte array is excluded.
module Data.Text.ParagraphLayout.Internal.Break (breaksDesc, subOffsetsDesc)
module Data.Text.ParagraphLayout.Internal.Break
    (LineBreak(..)
    ,locale
    ,breaksDesc
    ,subOffsetsDesc
    )
where

import Data.Text (Text)
import Data.Text.Foreign (lengthWord8)
import Data.Text.ICU (Break, Breaker, breaksRight, brkPrefix, brkStatus)
import Data.Text.ICU
    (Break
    ,Breaker
    ,LocaleName(Locale)
    ,breaksRight
    ,brkPrefix
    ,brkStatus
    )

-- | Strictness levels of line-breaking rules,
-- corresponding to the CSS @line-break@ property.
data LineBreak = LBAuto | LBLoose | LBNormal | LBStrict

-- | Line breaking keyword to use in an ICU locale identifier.
lbKeyword :: LineBreak -> String
lbKeyword LBAuto = ""
lbKeyword LBLoose = "@lb=loose"
lbKeyword LBNormal = "@lb=normal"
lbKeyword LBStrict = "@lb=strict"

-- | An ICU locale identifier corresponding to the given IETF BCP 47 language
-- tag and line breaking strictness.
--
-- For line breaking, the differences are mostly in the strictness of breaking
-- Chinese and Japanese text.
locale :: String -> LineBreak -> LocaleName
locale lang lb = Locale $ (clean lang) ++ (lbKeyword lb)
    where
        -- ICU's "level 1 canonicalisation" can handle most BCP 47 tags,
        -- including case changes and converting hyphens to underscores.
        --
        -- This filter is here just to stop syntactically incorrect input.
        clean = takeWhile (`elem` ['A'..'Z'] ++ ['a'..'z'] ++ "_-")

-- | List of all breaks in the given text, with offsets in descending order,
-- including the status of the break if applicable.

M src/Data/Text/ParagraphLayout/Internal/Plain.hs => src/Data/Text/ParagraphLayout/Internal/Plain.hs +2 -7
@@ 24,7 24,7 @@ import Data.Text.Glyphize
    ,fontExtentsForDir
    ,shape
    )
import Data.Text.ICU (Breaker, LocaleName(Locale), breakCharacter, breakLine)
import Data.Text.ICU (Breaker, LocaleName, breakCharacter, breakLine)
import qualified Data.Text.ICU as BreakStatus (Line)
import Data.Text.Internal (Text(Text))
import qualified Data.Text.Lazy as Lazy


@@ 238,7 238,7 @@ resolveSpans p@(Paragraph arr pStart spans pOpts) = do
paragraphBreaks :: (LocaleName -> Breaker a) -> Paragraph -> Int -> String ->
    [(Int, a)]
paragraphBreaks breakFunc (Paragraph arr off _ _) end lang =
    breaksDesc (breakFunc (localeFromLanguage lang)) paragraphText
    breaksDesc (breakFunc (locale lang LBAuto)) paragraphText
    where
        paragraphText = Text arr off (end - off)



@@ 262,8 262,3 @@ runBreaksFromSpan run spanBreaks =
-- a line according to the CSS Text Module.
isEndSpace :: Char -> Bool
isEndSpace c = c `elem` [' ', '\t', '\x1680']

-- TODO: Convert from IETF BCP 47 language tag to ICU locale identifier,
--       possibly with an algorithm to find the best matching available locale.
localeFromLanguage :: String -> LocaleName
localeFromLanguage x = Locale $ map (\c -> if c == '-' then '_' else c) x

M src/Data/Text/ParagraphLayout/Internal/Span.hs => src/Data/Text/ParagraphLayout/Internal/Span.hs +5 -2
@@ 31,8 31,11 @@ data SpanOptions = SpanOptions

    { spanLanguage :: String
    -- ^ IETF BCP 47 language tag, such as the value expected to be found in
    -- the HTML @lang@ attribute.
    -- Used for selecting the appropriate glyphs and line breaking rules.
    -- the HTML @lang@ attribute, specifying the primary language for the
    -- span's text content. An empty string explicitly means "language unknown".
    --
    -- Used for selecting the appropriate glyphs and line breaking rules,
    -- primarily in East Asian languages.

    }
    deriving (Eq, Read, Show)

M test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs => test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs +51 -0
@@ 1,5 1,6 @@
module Data.Text.ParagraphLayout.Internal.BreakSpec (spec) where

import Control.Monad (forM_)
import Data.Text (empty, pack, singleton)
import Data.Text.ICU
    (LocaleName(Locale)


@@ 64,6 65,56 @@ spec = do
                    ,(0, BreakStatus.Soft)
                    ]

            let jaText = pack "五ヶ月‡コード"
            let jaBreaksStrict =
                    [(18, BreakStatus.Soft)
                    ,(12, BreakStatus.Soft)
                    ,(9, BreakStatus.Soft)
                    ,(6, BreakStatus.Soft)
                    ,(0, BreakStatus.Soft)
                    ]
            let jaBreaksLoose =
                    [(18, BreakStatus.Soft)
                    ,(15, BreakStatus.Soft)
                    ,(12, BreakStatus.Soft)
                    ,(9, BreakStatus.Soft)
                    ,(6, BreakStatus.Soft)
                    ,(3, BreakStatus.Soft)
                    ,(0, BreakStatus.Soft)
                    ]

            -- Observed behaviour.
            -- Not sure why Chinese rules are stricter for Japanese text.
            -- This behaviour may change with future versions of ICU.
            let expectedStrictLocales =
                    [""
                    ,"en"
                    ,"ja@lb=strict"
                    ,"zh"
                    ,"zh_Hans"
                    ,"zh_Hant"
                    ,"zxx"
                    ,"zxx-any-invalid-suffix"
                    ]
            let expectedLooseLocales =
                    ["@lb=loose"
                    ,"en@lb=loose"
                    ,"ja"
                    ,"ja_JP"
                    ,"ja-JP"
                    ,"ja-any-invalid-suffix"
                    ,"zh@lb=loose"
                    ,"zxx-any-invalid-suffix@lb=loose"
                    ]

            expectedStrictLocales `forM_` \l ->
                it ("uses strict line breaks for " ++ l ++ " locale") $
                    b l jaText `shouldBe` jaBreaksStrict

            expectedLooseLocales `forM_` \l ->
                it ("uses loose line breaks for " ++ l ++ " locale") $
                    b l jaText `shouldBe` jaBreaksLoose

        -- Probably not useful for a web browser rendering engine.
        describe "breakSentence" $ do
            let b lang = breaksDesc $ breakSentence (Locale lang)