~jaro/balkon

ref: c9c57b600e59c89dc1f4831c5df03c8cf4725056 balkon/src/Data/Text/ParagraphLayout/Break.hs -rw-r--r-- 1.7 KiB
c9c57b60Jaro Test line splitting with many runs. 1 year, 8 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
-- | Boundary analysis using "Data.Text.ICU", but returning numeric offsets
-- instead of text slices.
--
-- Within this module, each /offset/ refers to the number of `Word8` items
-- (also called UTF-8 code units or bytes) between the start of the input `Text`
-- and the position of the break. The internal offset of the `Text` from the
-- start of its underlying byte array is excluded.
module Data.Text.ParagraphLayout.Break (breaksDesc, subOffsetsDesc)
where

import Data.Text (Text)
import Data.Text.Foreign (lengthWord8)
import Data.Text.ICU (Break, Breaker, breaksRight, brkPrefix, brkStatus)

-- | Enumerate every break that the given breaker finds in the input text,
-- pairing the offset of each break with its status.
--
-- Offsets appear in descending order, so the start of the text (offset 0)
-- is the final element of the result.
--
-- The end of the text (the offset equal to the text length) is not reported.
breaksDesc :: Breaker a -> Text -> [(Int, a)]
breaksDesc breaker input =
    [brkStartOffsetStatus brk | brk <- breaksRight breaker input]

-- | Pair the start offset of a break (see `brkStartOffset`) with the status
-- that the breaker attached to it.
brkStartOffsetStatus :: Break a -> (Int, a)
brkStartOffsetStatus brk = (offset, status)
    where
        offset = brkStartOffset brk
        status = brkStatus brk

-- | ICU reports each "break" as a slice of text delimited by two boundaries.
-- This yields the offset of the boundary nearer to the start of the text,
-- computed as the number of `Word8` items in the text preceding the slice.
brkStartOffset :: Break a -> Int
brkStartOffset = lengthWord8 . brkPrefix

-- | Translate offsets into a text so that they become offsets into the slice
-- of that text which begins at offset @d@.
--
-- The input is expected to be in descending order of offset. Each offset is
-- shifted down by @d@, and the result stops at the first entry whose shifted
-- offset would be negative, that is, the first entry lying before the slice.
subOffsetsDesc :: Int -> [(Int, a)] -> [(Int, a)]
subOffsetsDesc d offsets = takeWhile insideSlice shifted
    where
        -- Offsets re-based onto the start of the slice.
        shifted = [(off - d, a) | (off, a) <- offsets]
        -- Non-negative offsets are the ones at or after the slice start.
        insideSlice = (>= 0) . fst