~jaro/balkon: Properly use ICU locale identifiers.

5 files changed, 102 insertions(+), 11 deletions(-)

M CHANGELOG.md
M src/Data/Text/ParagraphLayout/Internal/Break.hs
M src/Data/Text/ParagraphLayout/Internal/Plain.hs
M src/Data/Text/ParagraphLayout/Internal/Span.hs
M test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs

M CHANGELOG.md => CHANGELOG.md +5 -0

@@ 1,5 1,10 @@
 # Revision history for Balkón
 
+## 0.1.0.1 -- TBD
+
+* Internally, language tags will be cut at the first invalid character before
+  being passed to ICU.
+
 ## 0.1.0.0 -- 2023-03-13
 
 * Text shaping using HarfBuzz.

M src/Data/Text/ParagraphLayout/Internal/Break.hs => src/Data/Text/ParagraphLayout/Internal/Break.hs +39 -2

@@ 5,12 5,49 @@
 -- items (also called UTF-8 code units or bytes) between the start of the input
 -- `Text` and the position of the break. The internal offset of the `Text` from
 -- the start of its underlying byte array is excluded.
-module Data.Text.ParagraphLayout.Internal.Break (breaksDesc, subOffsetsDesc)
+module Data.Text.ParagraphLayout.Internal.Break
+    (LineBreak(..)
+    ,locale
+    ,breaksDesc
+    ,subOffsetsDesc
+    )
 where
 
 import Data.Text (Text)
 import Data.Text.Foreign (lengthWord8)
-import Data.Text.ICU (Break, Breaker, breaksRight, brkPrefix, brkStatus)
+import Data.Text.ICU
+    (Break
+    ,Breaker
+    ,LocaleName(Locale)
+    ,breaksRight
+    ,brkPrefix
+    ,brkStatus
+    )
+
+-- | Strictness levels of line-breaking rules,
+-- corresponding to the CSS @line-break@ property.
+data LineBreak = LBAuto | LBLoose | LBNormal | LBStrict
+
+-- | Line breaking keyword to use in an ICU locale identifier.
+lbKeyword :: LineBreak -> String
+lbKeyword LBAuto = ""
+lbKeyword LBLoose = "@lb=loose"
+lbKeyword LBNormal = "@lb=normal"
+lbKeyword LBStrict = "@lb=strict"
+
+-- | An ICU locale identifier corresponding to the given IETF BCP 47 language
+-- tag and line breaking strictness.
+--
+-- For line breaking, the differences are mostly in the strictness of breaking
+-- Chinese and Japanese text.
+locale :: String -> LineBreak -> LocaleName
+locale lang lb = Locale $ (clean lang) ++ (lbKeyword lb)
+    where
+        -- ICU's "level 1 canonicalisation" can handle most BCP 47 tags,
+        -- including case changes and converting hyphens to underscores.
+        --
+        -- This filter is here just to stop syntactically incorrect input.
+        clean = takeWhile (`elem` ['A'..'Z'] ++ ['a'..'z'] ++ "_-")
 
 -- | List of all breaks in the given text, with offsets in descending order,
 -- including the status of the break if applicable.

M src/Data/Text/ParagraphLayout/Internal/Plain.hs => src/Data/Text/ParagraphLayout/Internal/Plain.hs +2 -7

@@ 24,7 24,7 @@ import Data.Text.Glyphize
     ,fontExtentsForDir
     ,shape
     )
-import Data.Text.ICU (Breaker, LocaleName(Locale), breakCharacter, breakLine)
+import Data.Text.ICU (Breaker, LocaleName, breakCharacter, breakLine)
 import qualified Data.Text.ICU as BreakStatus (Line)
 import Data.Text.Internal (Text(Text))
 import qualified Data.Text.Lazy as Lazy


@@ 238,7 238,7 @@ resolveSpans p@(Paragraph arr pStart spans pOpts) = do
 paragraphBreaks :: (LocaleName -> Breaker a) -> Paragraph -> Int -> String ->
     [(Int, a)]
 paragraphBreaks breakFunc (Paragraph arr off _ _) end lang =
-    breaksDesc (breakFunc (localeFromLanguage lang)) paragraphText
+    breaksDesc (breakFunc (locale lang LBAuto)) paragraphText
     where
         paragraphText = Text arr off (end - off)
 


@@ 262,8 262,3 @@ runBreaksFromSpan run spanBreaks =
 -- a line according to the CSS Text Module.
 isEndSpace :: Char -> Bool
 isEndSpace c = c `elem` [' ', '\t', '\x1680']
-
--- TODO: Convert from IETF BCP 47 language tag to ICU locale identifier,
---       possibly with an algorithm to find the best matching available locale.
-localeFromLanguage :: String -> LocaleName
-localeFromLanguage x = Locale $ map (\c -> if c == '-' then '_' else c) x

M src/Data/Text/ParagraphLayout/Internal/Span.hs => src/Data/Text/ParagraphLayout/Internal/Span.hs +5 -2

@@ 31,8 31,11 @@ data SpanOptions = SpanOptions
 
     { spanLanguage :: String
     -- ^ IETF BCP 47 language tag, such as the value expected to be found in
-    -- the HTML @lang@ attribute.
-    -- Used for selecting the appropriate glyphs and line breaking rules.
+    -- the HTML @lang@ attribute, specifying the primary language for the
+    -- span's text content. An empty string explicitly means "language unknown".
+    --
+    -- Used for selecting the appropriate glyphs and line breaking rules,
+    -- primarily in East Asian languages.
 
     }
     deriving (Eq, Read, Show)

M test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs => test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs +51 -0

@@ 1,5 1,6 @@
 module Data.Text.ParagraphLayout.Internal.BreakSpec (spec) where
 
+import Control.Monad (forM_)
 import Data.Text (empty, pack, singleton)
 import Data.Text.ICU
     (LocaleName(Locale)


@@ 64,6 65,56 @@ spec = do
                     ,(0, BreakStatus.Soft)
                     ]
 
+            let jaText = pack "五ヶ月‡コード"
+            let jaBreaksStrict =
+                    [(18, BreakStatus.Soft)
+                    ,(12, BreakStatus.Soft)
+                    ,(9, BreakStatus.Soft)
+                    ,(6, BreakStatus.Soft)
+                    ,(0, BreakStatus.Soft)
+                    ]
+            let jaBreaksLoose =
+                    [(18, BreakStatus.Soft)
+                    ,(15, BreakStatus.Soft)
+                    ,(12, BreakStatus.Soft)
+                    ,(9, BreakStatus.Soft)
+                    ,(6, BreakStatus.Soft)
+                    ,(3, BreakStatus.Soft)
+                    ,(0, BreakStatus.Soft)
+                    ]
+
+            -- Observed behaviour.
+            -- Not sure why Chinese rules are stricter for Japanese text.
+            -- This behaviour may change with future versions of ICU.
+            let expectedStrictLocales =
+                    [""
+                    ,"en"
+                    ,"ja@lb=strict"
+                    ,"zh"
+                    ,"zh_Hans"
+                    ,"zh_Hant"
+                    ,"zxx"
+                    ,"zxx-any-invalid-suffix"
+                    ]
+            let expectedLooseLocales =
+                    ["@lb=loose"
+                    ,"en@lb=loose"
+                    ,"ja"
+                    ,"ja_JP"
+                    ,"ja-JP"
+                    ,"ja-any-invalid-suffix"
+                    ,"zh@lb=loose"
+                    ,"zxx-any-invalid-suffix@lb=loose"
+                    ]
+
+            expectedStrictLocales `forM_` \l ->
+                it ("uses strict line breaks for " ++ l ++ " locale") $
+                    b l jaText `shouldBe` jaBreaksStrict
+
+            expectedLooseLocales `forM_` \l ->
+                it ("uses loose line breaks for " ++ l ++ " locale") $
+                    b l jaText `shouldBe` jaBreaksLoose
+
         -- Probably not useful for a web browser rendering engine.
         describe "breakSentence" $ do
             let b lang = breaksDesc $ breakSentence (Locale lang)