From 7364410dd4b2c07e049b86f99732262160d1db70 Mon Sep 17 00:00:00 2001 From: Jaro Date: Tue, 14 Mar 2023 19:26:50 +0100 Subject: [PATCH] Properly use ICU locale identifiers. --- CHANGELOG.md | 5 ++ .../Text/ParagraphLayout/Internal/Break.hs | 41 ++++++++++++++- .../Text/ParagraphLayout/Internal/Plain.hs | 9 +--- .../Text/ParagraphLayout/Internal/Span.hs | 7 ++- .../ParagraphLayout/Internal/BreakSpec.hs | 51 +++++++++++++++++++ 5 files changed, 102 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 790f17d..7e0b68b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Revision history for Balkón +## 0.1.0.1 -- TBD + +* Internally, language tags will be cut at the first invalid character before + being passed to ICU. + ## 0.1.0.0 -- 2023-03-13 * Text shaping using HarfBuzz. diff --git a/src/Data/Text/ParagraphLayout/Internal/Break.hs b/src/Data/Text/ParagraphLayout/Internal/Break.hs index 35a1a40..c5d00fe 100644 --- a/src/Data/Text/ParagraphLayout/Internal/Break.hs +++ b/src/Data/Text/ParagraphLayout/Internal/Break.hs @@ -5,12 +5,49 @@ -- items (also called UTF-8 code units or bytes) between the start of the input -- `Text` and the position of the break. The internal offset of the `Text` from -- the start of its underlying byte array is excluded. -module Data.Text.ParagraphLayout.Internal.Break (breaksDesc, subOffsetsDesc) +module Data.Text.ParagraphLayout.Internal.Break + (LineBreak(..) + ,locale + ,breaksDesc + ,subOffsetsDesc + ) where import Data.Text (Text) import Data.Text.Foreign (lengthWord8) -import Data.Text.ICU (Break, Breaker, breaksRight, brkPrefix, brkStatus) +import Data.Text.ICU + (Break + ,Breaker + ,LocaleName(Locale) + ,breaksRight + ,brkPrefix + ,brkStatus + ) + +-- | Strictness levels of line-breaking rules, +-- corresponding to the CSS @line-break@ property. +data LineBreak = LBAuto | LBLoose | LBNormal | LBStrict deriving (Eq, Read, Show) + +-- | Line breaking keyword to use in an ICU locale identifier.
+lbKeyword :: LineBreak -> String +lbKeyword LBAuto = "" +lbKeyword LBLoose = "@lb=loose" +lbKeyword LBNormal = "@lb=normal" +lbKeyword LBStrict = "@lb=strict" + +-- | An ICU locale identifier corresponding to the given IETF BCP 47 language +-- tag and line breaking strictness. +-- +-- For line breaking, the differences are mostly in the strictness of breaking +-- Chinese and Japanese text. +locale :: String -> LineBreak -> LocaleName +locale lang lb = Locale $ clean lang ++ lbKeyword lb + where + -- ICU's "level 1 canonicalisation" can handle most BCP 47 tags, + -- including case changes and converting hyphens to underscores. + -- + -- The filter only cuts off invalid input; digits are valid (es-419). + clean = takeWhile (`elem` ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "_-") -- | List of all breaks in the given text, with offsets in descending order, -- including the status of the break if applicable. diff --git a/src/Data/Text/ParagraphLayout/Internal/Plain.hs b/src/Data/Text/ParagraphLayout/Internal/Plain.hs index 1a488f2..d06d74f 100644 --- a/src/Data/Text/ParagraphLayout/Internal/Plain.hs +++ b/src/Data/Text/ParagraphLayout/Internal/Plain.hs @@ -24,7 +24,7 @@ import Data.Text.Glyphize ,fontExtentsForDir ,shape ) -import Data.Text.ICU (Breaker, LocaleName(Locale), breakCharacter, breakLine) +import Data.Text.ICU (Breaker, LocaleName, breakCharacter, breakLine) import qualified Data.Text.ICU as BreakStatus (Line) import Data.Text.Internal (Text(Text)) import qualified Data.Text.Lazy as Lazy @@ -238,7 +238,7 @@ resolveSpans p@(Paragraph arr pStart spans pOpts) = do paragraphBreaks :: (LocaleName -> Breaker a) -> Paragraph -> Int -> String -> [(Int, a)] paragraphBreaks breakFunc (Paragraph arr off _ _) end lang = - breaksDesc (breakFunc (localeFromLanguage lang)) paragraphText + breaksDesc (breakFunc (locale lang LBAuto)) paragraphText where paragraphText = Text arr off (end - off) @@ -262,8 +262,3 @@ runBreaksFromSpan run spanBreaks = -- a line according to the CSS Text
Module. isEndSpace :: Char -> Bool isEndSpace c = c `elem` [' ', '\t', '\x1680'] - --- TODO: Convert from IETF BCP 47 language tag to ICU locale identifier, --- possibly with an algorithm to find the best matching available locale. -localeFromLanguage :: String -> LocaleName -localeFromLanguage x = Locale $ map (\c -> if c == '-' then '_' else c) x diff --git a/src/Data/Text/ParagraphLayout/Internal/Span.hs b/src/Data/Text/ParagraphLayout/Internal/Span.hs index 8399f69..b8750b1 100644 --- a/src/Data/Text/ParagraphLayout/Internal/Span.hs +++ b/src/Data/Text/ParagraphLayout/Internal/Span.hs @@ -31,8 +31,11 @@ data SpanOptions = SpanOptions { spanLanguage :: String -- ^ IETF BCP 47 language tag, such as the value expected to be found in - -- the HTML @lang@ attribute. - -- Used for selecting the appropriate glyphs and line breaking rules. + -- the HTML @lang@ attribute, specifying the primary language for the + -- span's text content. An empty string explicitly means "language unknown". + -- + -- Used for selecting the appropriate glyphs and line breaking rules, + -- primarily in East Asian languages. 
} deriving (Eq, Read, Show) diff --git a/test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs b/test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs index 544a03f..37d9b06 100644 --- a/test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs +++ b/test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs @@ -1,5 +1,6 @@ module Data.Text.ParagraphLayout.Internal.BreakSpec (spec) where +import Control.Monad (forM_) import Data.Text (empty, pack, singleton) import Data.Text.ICU (LocaleName(Locale) @@ -64,6 +65,56 @@ spec = do ,(0, BreakStatus.Soft) ] + let jaText = pack "五ヶ月‡コード" + let jaBreaksStrict = + [(18, BreakStatus.Soft) + ,(12, BreakStatus.Soft) + ,(9, BreakStatus.Soft) + ,(6, BreakStatus.Soft) + ,(0, BreakStatus.Soft) + ] + let jaBreaksLoose = + [(18, BreakStatus.Soft) + ,(15, BreakStatus.Soft) + ,(12, BreakStatus.Soft) + ,(9, BreakStatus.Soft) + ,(6, BreakStatus.Soft) + ,(3, BreakStatus.Soft) + ,(0, BreakStatus.Soft) + ] + + -- Observed behaviour. + -- Not sure why Chinese rules are stricter for Japanese text. + -- This behaviour may change with future versions of ICU. + let expectedStrictLocales = + ["" + ,"en" + ,"ja@lb=strict" + ,"zh" + ,"zh_Hans" + ,"zh_Hant" + ,"zxx" + ,"zxx-any-invalid-suffix" + ] + let expectedLooseLocales = + ["@lb=loose" + ,"en@lb=loose" + ,"ja" + ,"ja_JP" + ,"ja-JP" + ,"ja-any-invalid-suffix" + ,"zh@lb=loose" + ,"zxx-any-invalid-suffix@lb=loose" + ] + + expectedStrictLocales `forM_` \l -> + it ("uses strict line breaks for " ++ l ++ " locale") $ + b l jaText `shouldBe` jaBreaksStrict + + expectedLooseLocales `forM_` \l -> + it ("uses loose line breaks for " ++ l ++ " locale") $ + b l jaText `shouldBe` jaBreaksLoose + -- Probably not useful for a web browser rendering engine. describe "breakSentence" $ do let b lang = breaksDesc $ breakSentence (Locale lang) -- 2.30.2