~jaro/balkon

ref: cf3ec3e35ebdbbe540c0a1490545622157271543 balkon/src/Data/Text/ParagraphLayout/Internal/BiDiLevels.hs -rw-r--r-- 6.5 KiB
cf3ec3e3Jaro Allow trimmed whitespace to generate empty fragments. 1 year, 2 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
module Data.Text.ParagraphLayout.Internal.BiDiLevels
    ( Level
    , TextLevels (TextLevels)
    , WithLevel
    , baseLevel
    , directionLevel
    , dropLevels
    , firstStrongDirection
    , headLevel
    , level
    , levelDirectionH
    , tailLevels
    , textLevels
    )
where

import Data.Maybe (catMaybes, listToMaybe)
import Data.Text (Text, uncons, unpack)
import Data.Text.Glyphize (Direction (..))
import qualified Data.Text.ICU.Char as ICUChar
import Data.Word (Word8)

-- | BiDi level, between 0 and 125 inclusive.
-- Even values mean left-to-right text.
-- Odd values mean right-to-left text.
type Level = Word8

-- | Typeclass for any data structure with an associated BiDi level.
class WithLevel a where
    level :: a -> Level

-- | BiDi levels for each character from a given input text,
-- plus the base level (for safe handling of empty text runs).
--
-- This wrapper is meant to ease the transition to a different internal
-- representation, if required when integrating with the ICU.
data TextLevels = TextLevels [Level] Level
    deriving (Eq, Show)

headLevel :: TextLevels -> Level
headLevel (TextLevels xs base) = case xs of
    [] -> base
    (x : _) -> x

tailLevels :: TextLevels -> TextLevels
tailLevels (TextLevels xs base) = TextLevels (tail xs) base

dropLevels :: Int -> TextLevels -> TextLevels
dropLevels n (TextLevels xs base) = TextLevels (drop n xs) base

baseLevel :: TextLevels -> Level
baseLevel (TextLevels _ base) = base

-- | Determine the BiDi level of each character in the input text
-- using a simplified algorithm with the following main limitations:
--
-- - Explicit directional formatting characters are ignored.
--   This, by extension, means that levels cannot be controlled manually.
--
-- - Arabic and European numbers are treated equally and do not absorb
--   separators, terminators, or nonspacing marks.
--
-- - Paired brackets have no effect on text direction.
--
-- TODO: Use Haskell bindings to the ICU BiDi implementation once available.
textLevels :: Direction -> Text -> TextLevels
textLevels baseDir txt =
    TextLevels (textLevels' base base 0 txt) base
    where
        base = directionLevel 0 baseDir

-- | Determine the BiDi level of each character in the input text
-- using a simplified algorithm.
textLevels' :: Level -> Level -> Int -> Text -> [Level]
textLevels' base previousLevel neutrals txt = case uncons txt of
    Just (char, rest) -> case charLevel base char of
        Just currentLevel ->
            replicate neutrals (mergeLevel base previousLevel currentLevel)
            ++
            currentLevel
            :
            textLevels' base currentLevel 0 rest
        Nothing ->
            textLevels' base previousLevel (neutrals + 1) rest
    Nothing ->
        replicate neutrals base

-- | Determine the level of a neutral character based on its surrounding levels.
mergeLevel :: Level -> Level -> Level -> Level
mergeLevel base prev cur
    | prev == cur
        = prev
    | low > base && even high -- Avoid breaking a RTL run by numbers.
        = low
    | otherwise
        = base
    where
        low = min prev cur
        high = max prev cur

data SimpleType = StrongLTR | StrongRTL | Number | Neutral | Reset

-- | Determine the BiDi level of one character
-- using a simplified algorithm.
charLevel :: Level -> Char -> Maybe Level
charLevel base c = case simpleType c of
    StrongLTR -> Just $ directionLevel base DirLTR
    StrongRTL -> Just $ directionLevel base DirRTL
    Number -> Just $ directionLevel (base + 1) DirLTR
    Neutral -> Nothing
    Reset -> Just base

-- | Reduce the character's BiDi class into a simpler category
-- for the purposes of the simplified algorithm.
simpleType :: Char -> SimpleType
simpleType c = case ICUChar.direction c of
    -- Strong characters have a definitive type.
    ICUChar.LeftToRight -> StrongLTR
    ICUChar.RightToLeft -> StrongRTL
    ICUChar.RightToLeftArabic -> StrongRTL
    -- Ignoring separators and terminators, all numbers behave the same.
    ICUChar.EuropeanNumber -> Number
    ICUChar.ArabicNumber -> Number
    -- Treating the following weak characters as neutral for simplicity.
    ICUChar.EuropeanNumberSeparator -> Neutral
    ICUChar.EuropeanNumberTerminator -> Neutral
    ICUChar.CommonNumberSeparator -> Neutral
    ICUChar.DirNonSpacingMark -> Neutral
    ICUChar.BoundaryNeutral -> Neutral
    -- The following characters are always neutral.
    ICUChar.WhiteSpaceNeutral -> Neutral
    ICUChar.OtherNeutral -> Neutral
    -- The following characters get their level reset.
    ICUChar.BlockSeparator -> Reset
    ICUChar.SegmentSeparator -> Reset
    -- Explicit formatting is not handled by this algorithm.
    ICUChar.LeftToRightEmbedding -> Neutral
    ICUChar.LeftToRightOverride -> Neutral
    ICUChar.RightToLeftEmbedding -> Neutral
    ICUChar.RightToLeftOverride -> Neutral
    ICUChar.PopDirectionalFormat -> Neutral
    ICUChar.FirstStrongIsolate -> Neutral
    ICUChar.LeftToRightIsolate -> Neutral
    ICUChar.RightToLeftIsolate -> Neutral
    ICUChar.PopDirectionalIsolate -> Neutral

-- | `Just` the direction of the first strongly directional character,
-- or `Nothing` if there is no strongly directional character.
firstStrongDirection :: Text -> Maybe Direction
firstStrongDirection = firstJust . map strongDirection . unpack

-- | `Just` the direction of a strongly directional character,
-- or `Nothing` if the character is not strongly directional.
strongDirection :: Char -> Maybe Direction
strongDirection c = case ICUChar.direction c of
    ICUChar.LeftToRight -> Just DirLTR
    ICUChar.RightToLeft -> Just DirRTL
    ICUChar.RightToLeftArabic -> Just DirRTL
    _ -> Nothing

-- | The first `Just` value found in the input list, if there is one,
-- otherwise `Nothing`.
firstJust :: [Maybe a] -> Maybe a
firstJust = listToMaybe . catMaybes

-- | Convert embedding level to horizontal text direction.
levelDirectionH :: Level -> Direction
levelDirectionH lvl
    | even lvl = DirLTR
    | otherwise = DirRTL

-- | Convert text direction to the smallest corresponding embedding level,
-- but no smaller than the given minimum.
directionLevel :: Level -> Direction -> Level
directionLevel low DirLTR = smallestEvenAtLeast low
directionLevel low DirRTL = smallestOddAtLeast low
directionLevel low DirTTB = smallestEvenAtLeast low
directionLevel low DirBTT = smallestOddAtLeast low

-- | Smallest even integer greater than or equal to @x@.
smallestEvenAtLeast :: Integral a => a -> a
smallestEvenAtLeast x = if even x then x else x + 1

-- | Smallest odd integer greater than or equal to @x@.
smallestOddAtLeast :: Integral a => a -> a
smallestOddAtLeast x = if odd x then x else x + 1