M CHANGELOG.md => CHANGELOG.md +5 -0
@@ 1,5 1,10 @@
# Revision history for Balkón
+## 0.1.0.1 -- TBD
+
+* Internally, language tags will be cut at the first invalid character before
+ being passed to ICU.
+
## 0.1.0.0 -- 2023-03-13
* Text shaping using HarfBuzz.
M src/Data/Text/ParagraphLayout/Internal/Break.hs => src/Data/Text/ParagraphLayout/Internal/Break.hs +39 -2
@@ 5,12 5,49 @@
-- items (also called UTF-8 code units or bytes) between the start of the input
-- `Text` and the position of the break. The internal offset of the `Text` from
-- the start of its underlying byte array is excluded.
-module Data.Text.ParagraphLayout.Internal.Break (breaksDesc, subOffsetsDesc)
+module Data.Text.ParagraphLayout.Internal.Break
+ (LineBreak(..)
+ ,locale
+ ,breaksDesc
+ ,subOffsetsDesc
+ )
where
import Data.Text (Text)
import Data.Text.Foreign (lengthWord8)
-import Data.Text.ICU (Break, Breaker, breaksRight, brkPrefix, brkStatus)
+import Data.Text.ICU
+ (Break
+ ,Breaker
+ ,LocaleName(Locale)
+ ,breaksRight
+ ,brkPrefix
+ ,brkStatus
+ )
+
+-- | Strictness levels of line-breaking rules,
+-- corresponding to the CSS @line-break@ property.
+data LineBreak = LBAuto | LBLoose | LBNormal | LBStrict
+
+-- | Line breaking keyword to use in an ICU locale identifier.
+lbKeyword :: LineBreak -> String
+lbKeyword LBAuto = ""
+lbKeyword LBLoose = "@lb=loose"
+lbKeyword LBNormal = "@lb=normal"
+lbKeyword LBStrict = "@lb=strict"
+
+-- | An ICU locale identifier corresponding to the given IETF BCP 47 language
+-- tag and line breaking strictness.
+--
+-- For line breaking, the differences are mostly in the strictness of breaking
+-- Chinese and Japanese text.
+locale :: String -> LineBreak -> LocaleName
+locale lang lb = Locale $ (clean lang) ++ (lbKeyword lb)
+ where
+ -- ICU's "level 1 canonicalisation" can handle most BCP 47 tags,
+ -- including case changes and converting hyphens to underscores.
+ --
+ -- This filter is here just to stop syntactically incorrect input.
+ clean = takeWhile (`elem` ['A'..'Z'] ++ ['a'..'z'] ++ "_-")
-- | List of all breaks in the given text, with offsets in descending order,
-- including the status of the break if applicable.
M src/Data/Text/ParagraphLayout/Internal/Plain.hs => src/Data/Text/ParagraphLayout/Internal/Plain.hs +2 -7
@@ 24,7 24,7 @@ import Data.Text.Glyphize
,fontExtentsForDir
,shape
)
-import Data.Text.ICU (Breaker, LocaleName(Locale), breakCharacter, breakLine)
+import Data.Text.ICU (Breaker, LocaleName, breakCharacter, breakLine)
import qualified Data.Text.ICU as BreakStatus (Line)
import Data.Text.Internal (Text(Text))
import qualified Data.Text.Lazy as Lazy
@@ 238,7 238,7 @@ resolveSpans p@(Paragraph arr pStart spans pOpts) = do
paragraphBreaks :: (LocaleName -> Breaker a) -> Paragraph -> Int -> String ->
[(Int, a)]
paragraphBreaks breakFunc (Paragraph arr off _ _) end lang =
- breaksDesc (breakFunc (localeFromLanguage lang)) paragraphText
+ breaksDesc (breakFunc (locale lang LBAuto)) paragraphText
where
paragraphText = Text arr off (end - off)
@@ 262,8 262,3 @@ runBreaksFromSpan run spanBreaks =
-- a line according to the CSS Text Module.
isEndSpace :: Char -> Bool
isEndSpace c = c `elem` [' ', '\t', '\x1680']
-
--- TODO: Convert from IETF BCP 47 language tag to ICU locale identifier,
--- possibly with an algorithm to find the best matching available locale.
-localeFromLanguage :: String -> LocaleName
-localeFromLanguage x = Locale $ map (\c -> if c == '-' then '_' else c) x
M src/Data/Text/ParagraphLayout/Internal/Span.hs => src/Data/Text/ParagraphLayout/Internal/Span.hs +5 -2
@@ 31,8 31,11 @@ data SpanOptions = SpanOptions
{ spanLanguage :: String
-- ^ IETF BCP 47 language tag, such as the value expected to be found in
- -- the HTML @lang@ attribute.
- -- Used for selecting the appropriate glyphs and line breaking rules.
+ -- the HTML @lang@ attribute, specifying the primary language for the
+ -- span's text content. An empty string explicitly means "language unknown".
+ --
+ -- Used for selecting the appropriate glyphs and line breaking rules,
+ -- primarily in East Asian languages.
}
deriving (Eq, Read, Show)
M test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs => test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs +51 -0
@@ 1,5 1,6 @@
module Data.Text.ParagraphLayout.Internal.BreakSpec (spec) where
+import Control.Monad (forM_)
import Data.Text (empty, pack, singleton)
import Data.Text.ICU
(LocaleName(Locale)
@@ 64,6 65,56 @@ spec = do
,(0, BreakStatus.Soft)
]
+ let jaText = pack "五ヶ月‡コード"
+ let jaBreaksStrict =
+ [(18, BreakStatus.Soft)
+ ,(12, BreakStatus.Soft)
+ ,(9, BreakStatus.Soft)
+ ,(6, BreakStatus.Soft)
+ ,(0, BreakStatus.Soft)
+ ]
+ let jaBreaksLoose =
+ [(18, BreakStatus.Soft)
+ ,(15, BreakStatus.Soft)
+ ,(12, BreakStatus.Soft)
+ ,(9, BreakStatus.Soft)
+ ,(6, BreakStatus.Soft)
+ ,(3, BreakStatus.Soft)
+ ,(0, BreakStatus.Soft)
+ ]
+
+ -- Observed behaviour.
+ -- Not sure why Chinese rules are stricter for Japanese text.
+ -- This behaviour may change with future versions of ICU.
+ let expectedStrictLocales =
+ [""
+ ,"en"
+ ,"ja@lb=strict"
+ ,"zh"
+ ,"zh_Hans"
+ ,"zh_Hant"
+ ,"zxx"
+ ,"zxx-any-invalid-suffix"
+ ]
+ let expectedLooseLocales =
+ ["@lb=loose"
+ ,"en@lb=loose"
+ ,"ja"
+ ,"ja_JP"
+ ,"ja-JP"
+ ,"ja-any-invalid-suffix"
+ ,"zh@lb=loose"
+ ,"zxx-any-invalid-suffix@lb=loose"
+ ]
+
+ expectedStrictLocales `forM_` \l ->
+ it ("uses strict line breaks for " ++ l ++ " locale") $
+ b l jaText `shouldBe` jaBreaksStrict
+
+ expectedLooseLocales `forM_` \l ->
+ it ("uses loose line breaks for " ++ l ++ " locale") $
+ b l jaText `shouldBe` jaBreaksLoose
+
-- Probably not useful for a web browser rendering engine.
describe "breakSentence" $ do
let b lang = breaksDesc $ breakSentence (Locale lang)