module Data.Text.ParagraphLayout.Internal.BiDiLevels
( Level
, TextLevels (TextLevels)
, WithLevel
, dropLevels
, firstStrongDirection
, headLevel
, level
, levelDirectionH
, tailLevels
, textLevels
)
where
import Data.Maybe (catMaybes, listToMaybe)
import Data.Text (Text, uncons, unpack)
import Data.Text.Glyphize (Direction (..))
import qualified Data.Text.ICU.Char as ICUChar
import Data.Word (Word8)
-- | BiDi level, between 0 and 125 inclusive.
-- Even values mean left-to-right text.
-- Odd values mean right-to-left text.
type Level = Word8
-- | Typeclass for any data structure with an associated BiDi level.
class WithLevel a where
level :: a -> Level
-- | BiDi levels for each character from a given input text.
--
-- This wrapper is meant to ease the transition to a different internal
-- representation, if required when integrating with the ICU.
newtype TextLevels = TextLevels [Level]
deriving (Eq, Show)
headLevel :: TextLevels -> Level
headLevel (TextLevels xs) = head xs
tailLevels :: TextLevels -> TextLevels
tailLevels (TextLevels xs) = TextLevels (tail xs)
dropLevels :: Int -> TextLevels -> TextLevels
dropLevels n (TextLevels xs) = TextLevels (drop n xs)
-- | Determine the BiDi level of each character in the input text
-- using a simplified algorithm with the following main limitations:
--
-- - Explicit directional formatting characters are ignored.
-- This, by extension, means that levels cannot be controlled manually.
--
-- - Arabic and European numbers are treated equally and do not absorb
-- separators, terminators, or nonspacing marks.
--
-- - Paired brackets have no effect on text direction.
--
-- TODO: Use Haskell bindings to the ICU BiDi implementation once available.
textLevels :: Direction -> Text -> TextLevels
textLevels baseDir txt = TextLevels $ textLevels' baseLevel baseLevel 0 txt
where
baseLevel = directionLevel 0 baseDir
-- | Determine the BiDi level of each character in the input text
-- using a simplified algorithm.
textLevels' :: Level -> Level -> Int -> Text -> [Level]
textLevels' baseLevel previousLevel neutrals txt = case uncons txt of
Just (char, rest) -> case charLevel baseLevel char of
Just currentLevel ->
replicate neutrals (mergeLevel baseLevel previousLevel currentLevel)
++
currentLevel
:
textLevels' baseLevel currentLevel 0 rest
Nothing ->
textLevels' baseLevel previousLevel (neutrals + 1) rest
Nothing ->
replicate neutrals baseLevel
-- | Determine the level of a neutral character based on its surrounding levels.
mergeLevel :: Level -> Level -> Level -> Level
mergeLevel base prev cur
| prev == cur
= prev
| low > base && even high -- Avoid breaking a RTL run by numbers.
= low
| otherwise
= base
where
low = min prev cur
high = max prev cur
data SimpleType = StrongLTR | StrongRTL | Number | Neutral | Reset
-- | Determine the BiDi level of one character
-- using a simplified algorithm.
charLevel :: Level -> Char -> Maybe Level
charLevel baseLevel c = case simpleType c of
StrongLTR -> Just $ directionLevel baseLevel DirLTR
StrongRTL -> Just $ directionLevel baseLevel DirRTL
Number -> Just $ directionLevel (baseLevel + 1) DirLTR
Neutral -> Nothing
Reset -> Just baseLevel
-- | Reduce the character's BiDi class into a simpler category
-- for the purposes of the simplified algorithm.
simpleType :: Char -> SimpleType
simpleType c = case ICUChar.direction c of
-- Strong characters have a definitive type.
ICUChar.LeftToRight -> StrongLTR
ICUChar.RightToLeft -> StrongRTL
ICUChar.RightToLeftArabic -> StrongRTL
-- Ignoring separators and terminators, all numbers behave the same.
ICUChar.EuropeanNumber -> Number
ICUChar.ArabicNumber -> Number
-- Treating the following weak characters as neutral for simplicity.
ICUChar.EuropeanNumberSeparator -> Neutral
ICUChar.EuropeanNumberTerminator -> Neutral
ICUChar.CommonNumberSeparator -> Neutral
ICUChar.DirNonSpacingMark -> Neutral
ICUChar.BoundaryNeutral -> Neutral
-- The following characters are always neutral.
ICUChar.WhiteSpaceNeutral -> Neutral
ICUChar.OtherNeutral -> Neutral
-- The following characters get their level reset.
ICUChar.BlockSeparator -> Reset
ICUChar.SegmentSeparator -> Reset
-- Explicit formatting is not handled by this algorithm.
ICUChar.LeftToRightEmbedding -> Neutral
ICUChar.LeftToRightOverride -> Neutral
ICUChar.RightToLeftEmbedding -> Neutral
ICUChar.RightToLeftOverride -> Neutral
ICUChar.PopDirectionalFormat -> Neutral
ICUChar.FirstStrongIsolate -> Neutral
ICUChar.LeftToRightIsolate -> Neutral
ICUChar.RightToLeftIsolate -> Neutral
ICUChar.PopDirectionalIsolate -> Neutral
-- | `Just` the direction of the first strongly directional character,
-- or `Nothing` if there is no strongly directional character.
firstStrongDirection :: Text -> Maybe Direction
firstStrongDirection = firstJust . map strongDirection . unpack
-- | `Just` the direction of a strongly directional character,
-- or `Nothing` if the character is not strongly directional.
strongDirection :: Char -> Maybe Direction
strongDirection c = case ICUChar.direction c of
ICUChar.LeftToRight -> Just DirLTR
ICUChar.RightToLeft -> Just DirRTL
ICUChar.RightToLeftArabic -> Just DirRTL
_ -> Nothing
-- | The first `Just` value found in the input list, if there is one,
-- otherwise `Nothing`.
firstJust :: [Maybe a] -> Maybe a
firstJust = listToMaybe . catMaybes
-- | Convert embedding level to horizontal text direction.
levelDirectionH :: Level -> Direction
levelDirectionH lvl
| even lvl = DirLTR
| otherwise = DirRTL
-- | Convert text direction to the smallest corresponding embedding level,
-- but no smaller than the given minimum.
directionLevel :: Level -> Direction -> Level
directionLevel low DirLTR = smallestEvenAtLeast low
directionLevel low DirRTL = smallestOddAtLeast low
directionLevel low DirTTB = smallestEvenAtLeast low
directionLevel low DirBTT = smallestOddAtLeast low
-- | Smallest even integer greater than or equal to @x@.
smallestEvenAtLeast :: Integral a => a -> a
smallestEvenAtLeast x = if even x then x else x + 1
-- | Smallest odd integer greater than or equal to @x@.
smallestOddAtLeast :: Integral a => a -> a
smallestOddAtLeast x = if odd x then x else x + 1