~jaro/balkon

ref: 8ec0fe09ec568b7e5309df9dc37cd3c0f7635283 balkon/src/Data/Text/ParagraphLayout/Internal/BiDiLevels.hs -rw-r--r-- 6.4 KiB
8ec0fe09Jaro Improve BiDi handling of numbers and neutrals. 1 year, 5 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
module Data.Text.ParagraphLayout.Internal.BiDiLevels
    ( Level
    , TextLevels (TextLevels)
    , WithLevel
    , dropLevels
    , firstStrongDirection
    , headLevel
    , level
    , levelDirectionH
    , tailLevels
    , textLevels
    )
where

import Data.Maybe (catMaybes, listToMaybe)
import Data.Text (Text, uncons, unpack)
import Data.Text.Glyphize (Direction (..))
import qualified Data.Text.ICU.Char as ICUChar
import Data.Word (Word8)

-- | BiDi level, between 0 and 125 inclusive.
-- Even values mean left-to-right text.
-- Odd values mean right-to-left text.
type Level = Word8

-- | Typeclass for any data structure with an associated BiDi level.
class WithLevel a where
    level :: a -> Level

-- | BiDi levels for each character from a given input text.
--
-- This wrapper is meant to ease the transition to a different internal
-- representation, if required when integrating with the ICU.
newtype TextLevels = TextLevels [Level]
    deriving (Eq, Show)

headLevel :: TextLevels -> Level
headLevel (TextLevels xs) = head xs

tailLevels :: TextLevels -> TextLevels
tailLevels (TextLevels xs) = TextLevels (tail xs)

dropLevels :: Int -> TextLevels -> TextLevels
dropLevels n (TextLevels xs) = TextLevels (drop n xs)

-- | Determine the BiDi level of each character in the input text
-- using a simplified algorithm with the following main limitations:
--
-- - Explicit directional formatting characters are ignored.
--   This, by extension, means that levels cannot be controlled manually.
--
-- - Arabic and European numbers are treated equally and do not absorb
--   separators, terminators, or nonspacing marks.
--
-- - Paired brackets have no effect on text direction.
--
-- TODO: Use Haskell bindings to the ICU BiDi implementation once available.
textLevels :: Direction -> Text -> TextLevels
textLevels baseDir txt = TextLevels $ textLevels' baseLevel baseLevel 0 txt
    where
        baseLevel = directionLevel 0 baseDir

-- | Determine the BiDi level of each character in the input text
-- using a simplified algorithm.
textLevels' :: Level -> Level -> Int -> Text -> [Level]
textLevels' baseLevel previousLevel neutrals txt = case uncons txt of
    Just (char, rest) -> case charLevel baseLevel char of
        Just currentLevel ->
            replicate neutrals (mergeLevel baseLevel previousLevel currentLevel)
            ++
            currentLevel
            :
            textLevels' baseLevel currentLevel 0 rest
        Nothing ->
            textLevels' baseLevel previousLevel (neutrals + 1) rest
    Nothing ->
        replicate neutrals baseLevel

-- | Determine the level of a neutral character based on its surrounding levels.
mergeLevel :: Level -> Level -> Level -> Level
mergeLevel base prev cur
    | prev == cur
        = prev
    | low > base && even high -- Avoid breaking a RTL run by numbers.
        = low
    | otherwise
        = base
    where
        low = min prev cur
        high = max prev cur

data SimpleType = StrongLTR | StrongRTL | Number | Neutral | Reset

-- | Determine the BiDi level of one character
-- using a simplified algorithm.
charLevel :: Level -> Char -> Maybe Level
charLevel baseLevel c = case simpleType c of
    StrongLTR -> Just $ directionLevel baseLevel DirLTR
    StrongRTL -> Just $ directionLevel baseLevel DirRTL
    Number -> Just $ directionLevel (baseLevel + 1) DirLTR
    Neutral -> Nothing
    Reset -> Just baseLevel

-- | Reduce the character's BiDi class into a simpler category
-- for the purposes of the simplified algorithm.
simpleType :: Char -> SimpleType
simpleType c = case ICUChar.direction c of
    -- Strong characters have a definitive type.
    ICUChar.LeftToRight -> StrongLTR
    ICUChar.RightToLeft -> StrongRTL
    ICUChar.RightToLeftArabic -> StrongRTL
    -- Ignoring separators and terminators, all numbers behave the same.
    ICUChar.EuropeanNumber -> Number
    ICUChar.ArabicNumber -> Number
    -- Treating the following weak characters as neutral for simplicity.
    ICUChar.EuropeanNumberSeparator -> Neutral
    ICUChar.EuropeanNumberTerminator -> Neutral
    ICUChar.CommonNumberSeparator -> Neutral
    ICUChar.DirNonSpacingMark -> Neutral
    ICUChar.BoundaryNeutral -> Neutral
    -- The following characters are always neutral.
    ICUChar.WhiteSpaceNeutral -> Neutral
    ICUChar.OtherNeutral -> Neutral
    -- The following characters get their level reset.
    ICUChar.BlockSeparator -> Reset
    ICUChar.SegmentSeparator -> Reset
    -- Explicit formatting is not handled by this algorithm.
    ICUChar.LeftToRightEmbedding -> Neutral
    ICUChar.LeftToRightOverride -> Neutral
    ICUChar.RightToLeftEmbedding -> Neutral
    ICUChar.RightToLeftOverride -> Neutral
    ICUChar.PopDirectionalFormat -> Neutral
    ICUChar.FirstStrongIsolate -> Neutral
    ICUChar.LeftToRightIsolate -> Neutral
    ICUChar.RightToLeftIsolate -> Neutral
    ICUChar.PopDirectionalIsolate -> Neutral

-- | `Just` the direction of the first strongly directional character,
-- or `Nothing` if there is no strongly directional character.
firstStrongDirection :: Text -> Maybe Direction
firstStrongDirection = firstJust . map strongDirection . unpack

-- | `Just` the direction of a strongly directional character,
-- or `Nothing` if the character is not strongly directional.
strongDirection :: Char -> Maybe Direction
strongDirection c = case ICUChar.direction c of
    ICUChar.LeftToRight -> Just DirLTR
    ICUChar.RightToLeft -> Just DirRTL
    ICUChar.RightToLeftArabic -> Just DirRTL
    _ -> Nothing

-- | The first `Just` value found in the input list, if there is one,
-- otherwise `Nothing`.
firstJust :: [Maybe a] -> Maybe a
firstJust = listToMaybe . catMaybes

-- | Convert embedding level to horizontal text direction.
levelDirectionH :: Level -> Direction
levelDirectionH lvl
    | even lvl = DirLTR
    | otherwise = DirRTL

-- | Convert text direction to the smallest corresponding embedding level,
-- but no smaller than the given minimum.
directionLevel :: Level -> Direction -> Level
directionLevel low DirLTR = smallestEvenAtLeast low
directionLevel low DirRTL = smallestOddAtLeast low
directionLevel low DirTTB = smallestEvenAtLeast low
directionLevel low DirBTT = smallestOddAtLeast low

-- | Smallest even integer greater than or equal to @x@.
smallestEvenAtLeast :: Integral a => a -> a
smallestEvenAtLeast x = if even x then x else x + 1

-- | Smallest odd integer greater than or equal to @x@.
smallestOddAtLeast :: Integral a => a -> a
smallestOddAtLeast x = if odd x then x else x + 1