1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
module Data.Text.ParagraphLayout.Internal.BiDiLevels
( Level
, TextLevels (TextLevels)
, WithLevel
, baseLevel
, directionLevel
, dropLevels
, firstStrongDirection
, headLevel
, level
, levelDirectionH
, tailLevels
, textLevels
)
where
import Data.Maybe (catMaybes, listToMaybe)
import Data.Text (Text, uncons, unpack)
import Data.Text.Glyphize (Direction (..))
import qualified Data.Text.ICU.Char as ICUChar
import Data.Word (Word8)
-- | BiDi level, between 0 and 125 inclusive.
-- Even values mean left-to-right text.
-- Odd values mean right-to-left text.
type Level = Word8
-- | Typeclass for any data structure with an associated BiDi level.
class WithLevel a where
level :: a -> Level
-- | BiDi levels for each character from a given input text,
-- plus the base level (for safe handling of empty text runs).
--
-- This wrapper is meant to ease the transition to a different internal
-- representation, if required when integrating with the ICU.
data TextLevels = TextLevels [Level] Level
deriving (Eq, Show)
headLevel :: TextLevels -> Level
headLevel (TextLevels xs base) = case xs of
[] -> base
(x : _) -> x
tailLevels :: TextLevels -> TextLevels
tailLevels (TextLevels xs base) = TextLevels (tail xs) base
dropLevels :: Int -> TextLevels -> TextLevels
dropLevels n (TextLevels xs base) = TextLevels (drop n xs) base
baseLevel :: TextLevels -> Level
baseLevel (TextLevels _ base) = base
-- | Determine the BiDi level of each character in the input text
-- using a simplified algorithm with the following main limitations:
--
-- - Explicit directional formatting characters are ignored.
-- This, by extension, means that levels cannot be controlled manually.
--
-- - Arabic and European numbers are treated equally and do not absorb
-- separators, terminators, or nonspacing marks.
--
-- - Paired brackets have no effect on text direction.
--
-- TODO: Use Haskell bindings to the ICU BiDi implementation once available.
textLevels :: Direction -> Text -> TextLevels
textLevels baseDir txt =
TextLevels (textLevels' base base 0 txt) base
where
base = directionLevel 0 baseDir
-- | Determine the BiDi level of each character in the input text
-- using a simplified algorithm.
textLevels' :: Level -> Level -> Int -> Text -> [Level]
textLevels' base previousLevel neutrals txt = case uncons txt of
Just (char, rest) -> case charLevel base char of
Just currentLevel ->
replicate neutrals (mergeLevel base previousLevel currentLevel)
++
currentLevel
:
textLevels' base currentLevel 0 rest
Nothing ->
textLevels' base previousLevel (neutrals + 1) rest
Nothing ->
replicate neutrals base
-- | Determine the level of a neutral character based on its surrounding levels.
mergeLevel :: Level -> Level -> Level -> Level
mergeLevel base prev cur
| prev == cur
= prev
| low > base && even high -- Avoid breaking a RTL run by numbers.
= low
| otherwise
= base
where
low = min prev cur
high = max prev cur
data SimpleType = StrongLTR | StrongRTL | Number | Neutral | Reset
-- | Determine the BiDi level of one character
-- using a simplified algorithm.
charLevel :: Level -> Char -> Maybe Level
charLevel base c = case simpleType c of
StrongLTR -> Just $ directionLevel base DirLTR
StrongRTL -> Just $ directionLevel base DirRTL
Number -> Just $ directionLevel (base + 1) DirLTR
Neutral -> Nothing
Reset -> Just base
-- | Reduce the character's BiDi class into a simpler category
-- for the purposes of the simplified algorithm.
simpleType :: Char -> SimpleType
simpleType c = case ICUChar.direction c of
-- Strong characters have a definitive type.
ICUChar.LeftToRight -> StrongLTR
ICUChar.RightToLeft -> StrongRTL
ICUChar.RightToLeftArabic -> StrongRTL
-- Ignoring separators and terminators, all numbers behave the same.
ICUChar.EuropeanNumber -> Number
ICUChar.ArabicNumber -> Number
-- Treating the following weak characters as neutral for simplicity.
ICUChar.EuropeanNumberSeparator -> Neutral
ICUChar.EuropeanNumberTerminator -> Neutral
ICUChar.CommonNumberSeparator -> Neutral
ICUChar.DirNonSpacingMark -> Neutral
ICUChar.BoundaryNeutral -> Neutral
-- The following characters are always neutral.
ICUChar.WhiteSpaceNeutral -> Neutral
ICUChar.OtherNeutral -> Neutral
-- The following characters get their level reset.
ICUChar.BlockSeparator -> Reset
ICUChar.SegmentSeparator -> Reset
-- Explicit formatting is not handled by this algorithm.
ICUChar.LeftToRightEmbedding -> Neutral
ICUChar.LeftToRightOverride -> Neutral
ICUChar.RightToLeftEmbedding -> Neutral
ICUChar.RightToLeftOverride -> Neutral
ICUChar.PopDirectionalFormat -> Neutral
ICUChar.FirstStrongIsolate -> Neutral
ICUChar.LeftToRightIsolate -> Neutral
ICUChar.RightToLeftIsolate -> Neutral
ICUChar.PopDirectionalIsolate -> Neutral
-- | `Just` the direction of the first strongly directional character,
-- or `Nothing` if there is no strongly directional character.
firstStrongDirection :: Text -> Maybe Direction
firstStrongDirection = firstJust . map strongDirection . unpack
-- | `Just` the direction of a strongly directional character,
-- or `Nothing` if the character is not strongly directional.
strongDirection :: Char -> Maybe Direction
strongDirection c = case ICUChar.direction c of
ICUChar.LeftToRight -> Just DirLTR
ICUChar.RightToLeft -> Just DirRTL
ICUChar.RightToLeftArabic -> Just DirRTL
_ -> Nothing
-- | The first `Just` value found in the input list, if there is one,
-- otherwise `Nothing`.
firstJust :: [Maybe a] -> Maybe a
firstJust = listToMaybe . catMaybes
-- | Convert embedding level to horizontal text direction.
levelDirectionH :: Level -> Direction
levelDirectionH lvl
| even lvl = DirLTR
| otherwise = DirRTL
-- | Convert text direction to the smallest corresponding embedding level,
-- but no smaller than the given minimum.
directionLevel :: Level -> Direction -> Level
directionLevel low DirLTR = smallestEvenAtLeast low
directionLevel low DirRTL = smallestOddAtLeast low
directionLevel low DirTTB = smallestEvenAtLeast low
directionLevel low DirBTT = smallestOddAtLeast low
-- | Smallest even integer greater than or equal to @x@.
smallestEvenAtLeast :: Integral a => a -> a
smallestEvenAtLeast x = if even x then x else x + 1
-- | Smallest odd integer greater than or equal to @x@.
smallestOddAtLeast :: Integral a => a -> a
smallestOddAtLeast x = if odd x then x else x + 1