module Data.Text.ParagraphLayout.Internal.BiDiLevels
( Level
, TextLevels (TextLevels)
, WithLevel
, baseLevel
, directionLevel
, dropLevels
, firstStrongDirection
, headLevel
, level
, levelDirectionH
, tailLevels
, textLevels
)
where
import Data.Maybe (catMaybes, listToMaybe)
import Data.Text (Text, uncons, unpack)
import Data.Text.Glyphize (Direction (..))
import qualified Data.Text.ICU.Char as ICUChar
import Data.Word (Word8)
-- | BiDi level, between 0 and 125 inclusive.
-- Even values mean left-to-right text.
-- Odd values mean right-to-left text.
type Level = Word8
-- | Typeclass for any data structure with an associated BiDi level.
class WithLevel a where
level :: a -> Level
-- | BiDi levels for each character from a given input text,
-- plus the base level (for safe handling of empty text runs).
--
-- This wrapper is meant to ease the transition to a different internal
-- representation, if required when integrating with the ICU.
data TextLevels = TextLevels [Level] Level
deriving (Eq, Show)
headLevel :: TextLevels -> Level
headLevel (TextLevels xs base) = case xs of
[] -> base
(x : _) -> x
tailLevels :: TextLevels -> TextLevels
tailLevels (TextLevels xs base) = TextLevels (tail xs) base
dropLevels :: Int -> TextLevels -> TextLevels
dropLevels n (TextLevels xs base) = TextLevels (drop n xs) base
baseLevel :: TextLevels -> Level
baseLevel (TextLevels _ base) = base
-- | Determine the BiDi level of each character in the input text
-- using a simplified algorithm with the following main limitations:
--
-- - Explicit directional formatting characters are ignored.
-- This, by extension, means that levels cannot be controlled manually.
--
-- - Arabic and European numbers are treated equally and do not absorb
-- separators, terminators, or nonspacing marks.
--
-- - Paired brackets have no effect on text direction.
--
-- TODO: Use Haskell bindings to the ICU BiDi implementation once available.
textLevels :: Direction -> Text -> TextLevels
textLevels baseDir txt =
TextLevels (textLevels' base base 0 txt) base
where
base = directionLevel 0 baseDir
-- | Determine the BiDi level of each character in the input text
-- using a simplified algorithm.
textLevels' :: Level -> Level -> Int -> Text -> [Level]
textLevels' base previousLevel neutrals txt = case uncons txt of
Just (char, rest) -> case charLevel base char of
Just currentLevel ->
replicate neutrals (mergeLevel base previousLevel currentLevel)
++
currentLevel
:
textLevels' base currentLevel 0 rest
Nothing ->
textLevels' base previousLevel (neutrals + 1) rest
Nothing ->
replicate neutrals base
-- | Determine the level of a neutral character based on its surrounding levels.
mergeLevel :: Level -> Level -> Level -> Level
mergeLevel base prev cur
| prev == cur
= prev
| low > base && even high -- Avoid breaking a RTL run by numbers.
= low
| otherwise
= base
where
low = min prev cur
high = max prev cur
data SimpleType = StrongLTR | StrongRTL | Number | Neutral | Reset
-- | Determine the BiDi level of one character
-- using a simplified algorithm.
charLevel :: Level -> Char -> Maybe Level
charLevel base c = case simpleType c of
StrongLTR -> Just $ directionLevel base DirLTR
StrongRTL -> Just $ directionLevel base DirRTL
Number -> Just $ directionLevel (base + 1) DirLTR
Neutral -> Nothing
Reset -> Just base
-- | Reduce the character's BiDi class into a simpler category
-- for the purposes of the simplified algorithm.
simpleType :: Char -> SimpleType
simpleType c = case ICUChar.direction c of
-- Strong characters have a definitive type.
ICUChar.LeftToRight -> StrongLTR
ICUChar.RightToLeft -> StrongRTL
ICUChar.RightToLeftArabic -> StrongRTL
-- Ignoring separators and terminators, all numbers behave the same.
ICUChar.EuropeanNumber -> Number
ICUChar.ArabicNumber -> Number
-- Treating the following weak characters as neutral for simplicity.
ICUChar.EuropeanNumberSeparator -> Neutral
ICUChar.EuropeanNumberTerminator -> Neutral
ICUChar.CommonNumberSeparator -> Neutral
ICUChar.DirNonSpacingMark -> Neutral
ICUChar.BoundaryNeutral -> Neutral
-- The following characters are always neutral.
ICUChar.WhiteSpaceNeutral -> Neutral
ICUChar.OtherNeutral -> Neutral
-- The following characters get their level reset.
ICUChar.BlockSeparator -> Reset
ICUChar.SegmentSeparator -> Reset
-- Explicit formatting is not handled by this algorithm.
ICUChar.LeftToRightEmbedding -> Neutral
ICUChar.LeftToRightOverride -> Neutral
ICUChar.RightToLeftEmbedding -> Neutral
ICUChar.RightToLeftOverride -> Neutral
ICUChar.PopDirectionalFormat -> Neutral
ICUChar.FirstStrongIsolate -> Neutral
ICUChar.LeftToRightIsolate -> Neutral
ICUChar.RightToLeftIsolate -> Neutral
ICUChar.PopDirectionalIsolate -> Neutral
-- | `Just` the direction of the first strongly directional character,
-- or `Nothing` if there is no strongly directional character.
firstStrongDirection :: Text -> Maybe Direction
firstStrongDirection = firstJust . map strongDirection . unpack
-- | `Just` the direction of a strongly directional character,
-- or `Nothing` if the character is not strongly directional.
strongDirection :: Char -> Maybe Direction
strongDirection c = case ICUChar.direction c of
ICUChar.LeftToRight -> Just DirLTR
ICUChar.RightToLeft -> Just DirRTL
ICUChar.RightToLeftArabic -> Just DirRTL
_ -> Nothing
-- | The first `Just` value found in the input list, if there is one,
-- otherwise `Nothing`.
firstJust :: [Maybe a] -> Maybe a
firstJust = listToMaybe . catMaybes
-- | Convert embedding level to horizontal text direction.
levelDirectionH :: Level -> Direction
levelDirectionH lvl
| even lvl = DirLTR
| otherwise = DirRTL
-- | Convert text direction to the smallest corresponding embedding level,
-- but no smaller than the given minimum.
directionLevel :: Level -> Direction -> Level
directionLevel low DirLTR = smallestEvenAtLeast low
directionLevel low DirRTL = smallestOddAtLeast low
directionLevel low DirTTB = smallestEvenAtLeast low
directionLevel low DirBTT = smallestOddAtLeast low
-- | Smallest even integer greater than or equal to @x@.
smallestEvenAtLeast :: Integral a => a -> a
smallestEvenAtLeast x = if even x then x else x + 1
-- | Smallest odd integer greater than or equal to @x@.
smallestOddAtLeast :: Integral a => a -> a
smallestOddAtLeast x = if odd x then x else x + 1