-- ~jaro/balkon: src/Data/Text/ParagraphLayout/Internal/Break.hs

-- | Boundary analysis using `Data.Text.ICU`, but returning numeric offsets
-- instead of text slices.
--
-- Within this module, each /offset/ refers to the number of `Data.Word.Word8`
-- items (also called UTF-8 code units or bytes) between the start of the input
-- `Text` and the position of the break. The internal offset of the `Text` from
-- the start of its underlying byte array is excluded.
module Data.Text.ParagraphLayout.Internal.Break
    (LineBreak(..)
    ,locale
    ,breaksDesc
    ,subOffsetsDesc
    )
where

import Data.Text (Text)
import Data.Text.Foreign (lengthWord8)
import Data.Text.ICU
    (Break
    ,Breaker
    ,LocaleName(Locale)
    ,breaksRight
    ,brkPrefix
    ,brkStatus
    )

-- | How strictly line-breaking rules should be applied.
--
-- The levels mirror the values of the CSS @line-break@ property.
data LineBreak
    = LBAuto
    -- ^ Counterpart of @line-break: auto@.
    | LBLoose
    -- ^ Counterpart of @line-break: loose@.
    | LBNormal
    -- ^ Counterpart of @line-break: normal@.
    | LBStrict
    -- ^ Counterpart of @line-break: strict@.

-- | Keyword suffix selecting the given line breaking strictness in an ICU
-- locale identifier.
--
-- 'LBAuto' yields the empty string, so that no keyword is added at all.
lbKeyword :: LineBreak -> String
lbKeyword strictness = case strictness of
    LBAuto -> ""
    LBLoose -> "@lb=loose"
    LBNormal -> "@lb=normal"
    LBStrict -> "@lb=strict"

-- | An ICU locale identifier corresponding to the given IETF BCP 47 language
-- tag and line breaking strictness.
--
-- The language tag is cut off at its first character that is not an ASCII
-- letter, an ASCII digit, a hyphen, or an underscore. The line breaking
-- keyword, if any, is appended to whatever remains.
--
-- For line breaking, the differences are mostly in the strictness of breaking
-- Chinese and Japanese text.
locale :: String -> LineBreak -> LocaleName
locale lang lb = Locale $ clean lang ++ lbKeyword lb
    where
        -- ICU's "level 1 canonicalisation" can handle most BCP 47 tags,
        -- including case changes and converting hyphens to underscores.
        --
        -- This filter is here just to stop syntactically incorrect input.
        clean = takeWhile (`elem` allowed)
        -- Digits must be accepted as well: BCP 47 permits them in subtags,
        -- for example the region in @es-419@ or the variant in @de-1996@.
        -- Without them, such tags would be truncated at the first digit.
        allowed = ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "_-"

-- | All breaks found in the given text by the given breaker, each paired
-- with its status, ordered by descending offset.
--
-- The start of the text (offset 0) is included and comes last in the list.
--
-- The end of the text (offset equal to the text length) is not included.
breaksDesc :: Breaker a -> Text -> [(Int, a)]
breaksDesc breaker input =
    [brkStartOffsetStatus brk | brk <- breaksRight breaker input]

-- | Pair the start offset of a break with the status of that break.
brkStartOffsetStatus :: Break a -> (Int, a)
brkStartOffsetStatus brk = (offset, status)
    where
        offset = brkStartOffset brk
        status = brkStatus brk

-- | Offset of the earlier boundary of a break.
--
-- The ICU library reports each "break" as a slice of text delimited by two
-- boundaries. Of the two distances from the start of the text to those
-- boundaries, this is the smaller one, measured as the length of the text
-- preceding the break in `Data.Word.Word8` units.
brkStartOffset :: Break a -> Int
brkStartOffset = lengthWord8 . brkPrefix

-- | Translate offsets into a text, given in descending order, into offsets
-- relative to a slice of that text which starts at the given offset.
--
-- The given slice start is subtracted from every offset. The output ends just
-- before the first offset that would become negative, that is, the first one
-- lying before the start of the slice.
subOffsetsDesc :: Int -> [(Int, a)] -> [(Int, a)]
subOffsetsDesc d offsets = takeWhile inSlice shifted
    where
        -- Offsets re-based so that the start of the slice is at zero.
        shifted = [(off - d, a) | (off, a) <- offsets]
        inSlice = (>= 0) . fst