-- | Boundary analysis using `Data.Text.ICU`, but returning numeric offsets
-- instead of text slices.
--
-- Within this module, each /offset/ refers to the number of `Data.Word.Word8`
-- items (also called UTF-8 code units or bytes) between the start of the input
-- `Text` and the position of the break. The internal offset of the `Text` from
-- the start of its underlying byte array is excluded.
module Data.Text.ParagraphLayout.Internal.Break
    ( LineBreak (..)
    , locale
    , breaksDesc
    , subOffsetsDesc
    )
where

import Data.Text (Text)
import Data.Text.Foreign (lengthWord8)
import Data.Text.ICU
    ( Break
    , Breaker
    , LocaleName (Locale)
    , breaksRight
    , brkPrefix
    , brkStatus
    )

-- | Strictness levels of line-breaking rules,
-- corresponding to the CSS @line-break@ property.
data LineBreak
    = LBAuto
    | LBLoose
    | LBNormal
    | LBStrict

-- | Line breaking keyword to use in an ICU locale identifier.
--
-- `LBAuto` contributes no keyword at all, so the resulting identifier
-- consists of the language tag alone.
lbKeyword :: LineBreak -> String
lbKeyword LBAuto = ""
lbKeyword LBLoose = "@lb=loose"
lbKeyword LBNormal = "@lb=normal"
lbKeyword LBStrict = "@lb=strict"

-- | An ICU locale identifier corresponding to the given IETF BCP 47 language
-- tag and line breaking strictness.
--
-- For line breaking, the differences are mostly in the strictness of breaking
-- Chinese and Japanese text.
--
-- The language tag is cut at its first character that is not an ASCII letter,
-- an ASCII digit, an underscore, or a hyphen; the line breaking keyword is
-- then appended to whatever remains.
locale :: String -> LineBreak -> LocaleName
locale lang lb = Locale $ clean lang ++ lbKeyword lb
    where
        -- ICU's "level 1 canonicalisation" can handle most BCP 47 tags,
        -- including case changes and converting hyphens to underscores.
        --
        -- This filter is here just to stop syntactically incorrect input.
        -- In particular, it never lets an @\@@ through, so the only keyword
        -- in the identifier is the one appended from `lbKeyword`.
        clean = takeWhile (`elem` tagChars)

        -- Digits must be accepted: BCP 47 subtags may be numeric or
        -- alphanumeric (for example the region in @es-419@ or the variant
        -- in @de-1996@), and rejecting them would truncate such tags.
        tagChars = ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "_-"

-- | List of all breaks in the given text, with offsets in descending order,
-- including the status of the break if applicable.
--
-- Includes the start of the text (with offset 0) as the last list item.
--
-- Excludes the end of the text (with offset equal to the text length).
breaksDesc :: Breaker a -> Text -> [(Int, a)]
breaksDesc breaker input =
    [brkStartOffsetStatus brk | brk <- breaksRight breaker input]

-- | Pair the start offset of a break with the status reported for it.
brkStartOffsetStatus :: Break a -> (Int, a)
brkStartOffsetStatus brk = (offset, status)
    where
        offset = brkStartOffset brk
        status = brkStatus brk

-- | The ICU library returns "breaks" as slices of text with two boundaries.
-- This gives the smaller of the two distances from the start of the text
-- to the boundaries of the break.
--
-- It is computed as the length, in `Data.Word.Word8` units, of the prefix
-- that the library reports for the break.
brkStartOffset :: Break a -> Int
brkStartOffset = lengthWord8 . brkPrefix

-- | Given a list of offsets into a text in descending order, produce a list of
-- corresponding offsets into a slice of the text starting at a given offset.
--
-- Every offset is shifted down by the slice start, and the list is cut at the
-- first shifted offset that is negative, that is, at the first break located
-- before the slice. The value paired with each offset is passed through
-- unchanged.
subOffsetsDesc :: Int -> [(Int, a)] -> [(Int, a)]
subOffsetsDesc d offsets =
    takeWhile inSlice [(off - d, a) | (off, a) <- offsets]
    where
        -- Because the input is descending, everything after the first
        -- negative result would be negative as well.
        inSlice (off, _) = off >= 0