{-# LANGUAGE MagicHash, UnliftedFFITypes, DeriveGeneric #-}
module Data.Text.Glyphize.Buffer where
import qualified Data.Text.Internal.Lazy as Lazy
import qualified Data.Text.Lazy as Lazy
import qualified Data.Text.Internal as Txt
import Data.Char (toUpper, toLower)
import Control.Monad (forM)
import Control.Exception (bracket)
--- To fill computed text property.
import Data.Text.Encoding (decodeUtf8Lenient)
import Codec.Binary.UTF8.Light (encodeUTF8, w2c, c2w)
import qualified Data.Text.Array as A
import GHC.Exts (ByteArray#, sizeofByteArray#, Int#)
import Data.Word (Word32)
import Data.Int (Int32)
import Data.Bits ((.|.), (.&.), shiftR, shiftL, testBit)
import System.IO.Unsafe (unsafePerformIO)
import Foreign.Marshal.Alloc (alloca, allocaBytes)
import Foreign.Ptr
import Foreign.C.String (CString, withCString, peekCString)
import Foreign.Storable (Storable(..))
import GHC.Generics (Generic(..))
import Foreign.Storable.Generic (GStorable(..))
------
--- Public Datastructures
------
-- | Text to be shaped or the resulting glyphs, for which language/script/direction/etc.
data Buffer = Buffer {
text :: Lazy.Text,
-- ^ The Unicode text, in visual order, for HarfBuzz to convert into glyphs.
contentType :: Maybe ContentType,
-- ^ What the bytes of the ByteString contents represents,
-- namely unicode characters (before shaping) or glyphs (result of shaping).
-- Typically callers should leave this as `Just ContentTypeUnicode`.
direction :: Maybe Direction,
-- ^ The text flow direction of the buffer.
-- No shaping can happen without setting buffer direction, and it controls
-- the visual direction for the output glyphs; for RTL direction the glyphs
-- will be reversed. Many layout features depend on the proper setting of
-- the direction, for example, reversing RTL text before shaping,
-- then shaping with LTR direction is not the same as keeping the text in
-- logical order and shaping with RTL direction.
script :: Maybe String,
-- ^ Script is crucial for choosing the proper shaping behaviour for scripts
-- that require it (e.g. Arabic) and the which OpenType features defined in
-- the font to be applied.
language :: Maybe String,
-- ^ Languages are crucial for selecting which OpenType feature to apply to
-- the buffer which can result in applying language-specific behaviour.
-- Languages are orthogonal to the scripts, and though they are related,
-- they are different concepts and should not be confused with each other.
beginsText :: Bool,
-- ^ special handling of the beginning of text paragraph can be applied to
-- this buffer. Should usually be set, unless you are passing to the buffer
-- only part of the text without the full context.
endsText :: Bool,
-- ^ special handling of the end of text paragraph can be applied to this buffer.
preserveDefaultIgnorables :: Bool,
-- ^ character with Default_Ignorable Unicode property should use the
-- corresponding glyph from the font, instead of hiding them (done by
-- replacing them with the space glyph and zeroing the advance width.)
-- Takes precedance over `removeDefaultIgnorables`.
removeDefaultIgnorables :: Bool,
-- ^ character with Default_Ignorable Unicode property should be removed
-- from glyph string instead of hiding them (done by replacing them with
-- the space glyph and zeroing the advance width.)
don'tInsertDottedCircle :: Bool,
-- ^ a dotted circle should not be inserted in the rendering of incorrect
-- character sequences (such as <0905 093E>).
clusterLevel :: ClusterLevel,
-- ^ dictates one aspect of how HarfBuzz will treat non-base characters
-- during shaping.
invisibleGlyph :: Char,
-- ^ The glyph number that replaces invisible characters in the
-- shaping result. If set to zero (default), the glyph for the U+0020
-- SPACE character is used. Otherwise, this value is used verbatim.
replacementCodepoint :: Char,
-- ^ the glyph number that replaces invalid entries for a given encoding
-- when adding text to buffer.
notFoundGlyph :: Char
-- ^ the glyph number that replaces replaces characters not found in the font.
} deriving (Eq, Show, Read, Ord)
-- | Whether the given text is Unicode or font-specific "glyphs".
data ContentType = ContentTypeUnicode | ContentTypeGlyphs deriving (Eq, Show, Read, Ord)
-- | Defines how fine the groupings represented by `GlyphInfo`'s `cluster` property are.`
data ClusterLevel = ClusterMonotoneGraphemes | ClusterMonotoneChars | ClusterChars
deriving (Eq, Show, Read, Ord)
-- | An empty buffer with sensible default properties.
defaultBuffer = Buffer {
text = Lazy.empty,
contentType = Just ContentTypeUnicode,
direction = Nothing,
script = Nothing,
language = Nothing,
beginsText = True,
endsText = True,
preserveDefaultIgnorables = False,
removeDefaultIgnorables = False,
don'tInsertDottedCircle = False,
clusterLevel = ClusterMonotoneGraphemes,
invisibleGlyph = ' ',
replacementCodepoint = '\xFFFD',
notFoundGlyph = '\0'
}
------
--- Directions
------
-- | The direction of a text segment or buffer.
data Direction = DirLTR | DirRTL | DirTTB | DirBTT deriving (Eq, Show, Read, Ord)
-- | Converts a string to an hb_direction_t.
-- Matching is loose and applies only to the first letter. For examples,
-- "LTR" and "left-to-right" will both return HB_DIRECTION_LTR.
dirFromStr ('L':_) = Just DirLTR
dirFromStr ('l':_) = Just DirLTR
dirFromStr ('R':_) = Just DirRTL
dirFromStr ('r':_) = Just DirRTL
dirFromStr ('T':_) = Just DirTTB
dirFromStr ('t':_) = Just DirTTB
dirFromStr ('B':_) = Just DirBTT
dirFromStr ('b':_) = Just DirBTT
dirFromStr _ = Nothing
-- | Converts an hb_direction_t to a string.
dirToStr DirLTR = "ltr"
dirToStr DirRTL = "rtl"
dirToStr DirTTB = "ttb"
dirToStr DirBTT = "btt"
-- | Reverses a text direction.
dirReverse DirLTR = DirRTL
dirReverse DirRTL = DirLTR
dirReverse DirTTB = DirBTT
dirReverse DirBTT = DirTTB
-- | Tests whether a text direction moves backward
-- (from right to left, or from bottom to top).
dirBackward dir = dir `Prelude.elem` [DirRTL, DirBTT]
-- | Tests whether a text direction moves forward
-- (from left to right, or from top to bottom).
dirForward dir = dir `Prelude.elem` [DirLTR, DirTTB]
-- | Tests whether a text direction is horizontal.
dirHorizontal dir = dir `Prelude.elem` [DirLTR, DirRTL]
-- | Tests whether a text direction is vertical.
dirVertical dir = dir `Prelude.elem` [DirTTB, DirBTT]
-- | Converts a `Direction` to C encoding.
dir2int Nothing = 0
dir2int (Just DirLTR) = 4
dir2int (Just DirRTL) = 5
dir2int (Just DirTTB) = 6
dir2int (Just DirBTT) = 7
-- | Sets `direction` property on C `Buffer'` struct.
foreign import ccall "hb_buffer_set_direction" hb_buffer_set_direction
:: Buffer' -> Int -> IO ()
-- | Converts a `Direction` from C encoding.
int2dir 4 = Just DirLTR
int2dir 5 = Just DirRTL
int2dir 6 = Just DirTTB
int2dir 7 = Just DirBTT
int2dir _ = Nothing
-- | Fetches the hb_direction_t of a script when it is set horizontally.
-- All right-to-left scripts will return `DirRTL`.
-- All left-to-right scripts will return `DirLTR`.
-- Scripts that can be written either horizontally or vertically will return `Nothing`.
-- Unknown scripts will return `DirLTR`.
scriptHorizontalDir :: String -> Maybe Direction
scriptHorizontalDir = int2dir . hb_script_get_horizontal_direction . script_from_string
foreign import ccall "hb_script_get_horizontal_direction" hb_script_get_horizontal_direction
:: Word32 -> Int
------
--- Locales
------
data Language'
-- | Represents a natural written language.
-- Corresponds to a BCP47 language tag.
type Language = Ptr Language'
-- | Fetch the default language from current locale.
-- NOTE that the first time this function is called, it calls (C code)
-- "setlocale (LC_CTYPE, nullptr)" to fetch current locale.
-- The underlying setlocale function is, in many implementations, NOT threadsafe.
-- To avoid problems, call this function once before multiple threads can call it.
-- This function may be used to fill in missing fields on a `Buffer`.
languageDefault :: IO String
languageDefault = hb_language_get_default >>= hb_language_to_string >>= peekCString
foreign import ccall "hb_language_to_string" hb_language_to_string :: Language -> IO CString
foreign import ccall "hb_language_get_default" hb_language_get_default :: IO Language
-- | Converts a `String` representing a BCP 47 language tag to the corresponding `Language`.
hb_language_from_string :: String -> IO Language
hb_language_from_string str =
withCString str $ \str' -> hb_language_from_string' str' (-1)
foreign import ccall "hb_language_from_string" hb_language_from_string'
:: CString -> Int -> IO Language
{-
-- | Check whether a second language tag is the same or a more specific version
-- of the provided language tag.
-- For example, "fa_IR.utf8" is a more specific tag for "fa" or for "fa_IR".
languageMatches :: String -> String -> Bool
languageMatches lang specific = unsafePerformIO $ do
lang' <- hb_language_from_string lang
specific' <- hb_language_from_string specific
hb_language_matches lang' specific'
foreign import ccall "hb_language_matches" hb_language_matches :: Language -> Language -> IO Bool-}
------
--- FFI Support
------
-- | Directly corresponds to "hb_buffer_t".
data Buffer''
type Buffer' = Ptr Buffer''
-- | Temporarily allocates a `Buffer'`
withNewBuffer :: (Buffer' -> IO a) -> IO a
withNewBuffer cb = bracket hb_buffer_create hb_buffer_destroy cb
foreign import ccall "hb_buffer_create" hb_buffer_create :: IO Buffer'
foreign import ccall "hb_buffer_destroy" hb_buffer_destroy :: Buffer' -> IO ()
-- | Decodes given lazy `Text` into given `Buffer'`.
-- Should be valid Unicode data.
-- Captures a few trailing & preceding chars when possible to give additional
-- context to the shaping.
bufferWithText _ Lazy.Empty cb = cb
bufferWithText buffer txt@(Lazy.Chunk (Txt.Text (A.ByteArray arr) offset length) txts) cb = do
hb_buffer_add_utf8 buffer arr (sizeofByteArray# arr) (toEnum offset) length
bufferWithText buffer txts cb
foreign import ccall "hb_buffer_add_utf8" hb_buffer_add_utf8
:: Buffer' -> ByteArray# -> Int# -> Word -> Int -> IO ()
-- | Converts initial char to uppercase & all others to lowercase.
-- Internal utility for reimplementation `script_from_string`.
titlecase :: String -> String
titlecase "" = ""
titlecase (c:cs) = toUpper c : Prelude.map toLower cs
-- | Converts a string str representing an ISO 15924 script tag to a corresponding "tag" `Word32`.
script_from_string :: String -> Word32
script_from_string str = tag_from_string $ case titlecase str of
'Q':'a':'a':'i':_ -> "Zinh"
'Q':'a':'a':'c':_ -> "Copt"
'A':'r':'a':'n':_ -> "Arab"
'C':'y':'r':'s':_ -> "Cyrl"
'G':'e':'o':'k':_ -> "Geor"
'H':'a':'n':'s':_ -> "Hani"
'H':'a':'n':'t':_ -> "Hani"
'J':'a':'m':'o':_ -> "Hang"
'L':'a':'t':'f':_ -> "Latn"
'L':'a':'t':'g':_ -> "Latn"
'S':'y':'r':'e':_ -> "Syrc"
'S':'y':'r':'j':_ -> "Syrc"
'S':'y':'r':'n':_ -> "Syrc"
x -> x
-- | Converts a `String` into a "tag" `Word32`. Valid tags are 4 `Char`s.
-- Shorter input `String`s will be padded with spaces.
-- Longer input strings will be truncated.
tag_from_string :: String -> Word32
tag_from_string str = case str ++ Prelude.repeat ' ' of
c1:c2:c3:c4:_ -> Prelude.foldl (.|.) 0 [
shiftL (c2w c1 .&. 0x7) 24,
shiftL (c2w c2 .&. 0x7) 16,
shiftL (c2w c3 .&. 0x7) 8,
shiftL (c2w c4 .&. 0x7) 0
]
_ -> 0
-- | Converts a "tag" `Word32` into a 4 `Char` `String`.
tag_to_string :: Word32 -> String
tag_to_string tag = [
w2c (shiftR tag 24 .&. 0x7),
w2c (shiftR tag 16 .&. 0x7),
w2c (shiftR tag 8 .&. 0x7),
w2c (shiftR tag 0 .&. 0x7)
]
------
--- Haskell-to-C conversion
------
-- | Temporarily allocates a `Buffer'` corresponding to the given `Buffer`
-- to be processed entirely within the given callback.
withBuffer :: Buffer -> (Buffer' -> IO a) -> IO a
withBuffer buf cb = withNewBuffer $ \buf' -> bufferWithText buf' (text buf) $ do
hb_buffer_set_content_type buf' $ case contentType buf of
Nothing -> 0
Just ContentTypeUnicode -> 1
Just ContentTypeGlyphs -> 2
hb_buffer_set_direction buf' $ dir2int $ direction buf
case script buf of
Just script' -> hb_buffer_set_script buf' $ script_from_string script'
Nothing -> return ()
case language buf of
Just lang' -> hb_buffer_set_language buf' =<< hb_language_from_string lang'
Nothing -> return ()
hb_buffer_set_flags buf' $ Prelude.foldl (.|.) 0 [
if beginsText buf then 1 else 0,
if endsText buf then 2 else 0,
if preserveDefaultIgnorables buf then 4 else 0,
if removeDefaultIgnorables buf then 8 else 0,
if don'tInsertDottedCircle buf then 16 else 0
]
hb_buffer_set_cluster_level buf' $ case clusterLevel buf of
ClusterMonotoneGraphemes -> 0
ClusterMonotoneChars -> 1
ClusterChars -> 2
hb_buffer_set_invisible_glyph buf' $ c2w $ invisibleGlyph buf
hb_buffer_set_replacement_codepoint buf' $ c2w $ replacementCodepoint buf
-- hb_buffer_set_not_found_glyph buf' $ c2w $ notFoundGlyph buf
case (contentType buf, direction buf, script buf, language buf) of
(Just ContentTypeUnicode, Nothing, _, _) -> hb_buffer_guess_segment_properties buf'
(Just ContentTypeUnicode, _, Nothing, _) -> hb_buffer_guess_segment_properties buf'
(Just ContentTypeUnicode, _, _, Nothing) -> hb_buffer_guess_segment_properties buf'
cb buf'
foreign import ccall "hb_buffer_set_content_type" hb_buffer_set_content_type
:: Buffer' -> Int -> IO ()
foreign import ccall "hb_buffer_set_script" hb_buffer_set_script
:: Buffer' -> Word32 -> IO ()
foreign import ccall "hb_buffer_set_language" hb_buffer_set_language
:: Buffer' -> Language -> IO ()
foreign import ccall "hb_buffer_set_flags" hb_buffer_set_flags :: Buffer' -> Int -> IO ()
foreign import ccall "hb_buffer_set_cluster_level" hb_buffer_set_cluster_level
:: Buffer' -> Int -> IO ()
foreign import ccall "hb_buffer_set_invisible_glyph" hb_buffer_set_invisible_glyph
:: Buffer' -> Word32 -> IO ()
foreign import ccall "hb_buffer_set_replacement_codepoint" hb_buffer_set_replacement_codepoint
:: Buffer' -> Word32 -> IO ()
--foreign import ccall "hb_buffer_set_not_found_glyph" hb_buffer_set_not_found_glyph
-- :: Buffer' -> Word32 -> IO ()
foreign import ccall "hb_buffer_guess_segment_properties" hb_buffer_guess_segment_properties
:: Buffer' -> IO ()
------
--- C-to-Haskell conversion
------
-- | Holds information about the glyphs & their relation to input text.
data GlyphInfo = GlyphInfo {
codepoint :: Word32,
-- ^ Glyph index (or unicode codepoint)
cluster :: Word32,
-- ^ The index of the character in the original text that corresponds to
-- this `GlyphInfo`. More than one `GlyphInfo` may have the same `cluster`
-- value if they resulted from the same character, & when more than one
-- character gets merged into the same glyph `GlyphInfo` will have the
-- smallest cluster value of them.
-- By default some characters are merged into the same cluster even when
-- they are seperate glyphs, `Buffer`'s `clusterLevel` property allows
-- selecting more fine grained cluster handling.
unsafeToBreak :: Bool,
-- ^ Indicates that if input text is broken at the beginning of the cluster
-- this glyph is part of, then both sides need to be re-shaped,
-- as the result might be different.
-- On the flip side, it means that when this flag is not present,
-- then it is safe to break the glyph-run at the beginning of this cluster,
-- and the two sides will represent the exact same result one would get
-- if breaking input text at the beginning of this cluster and shaping
-- the two sides separately. This can be used to optimize paragraph layout,
-- by avoiding re-shaping of each line after line-breaking.
unsafeToConcat :: Bool,
-- ^ Indicates that if input text is changed on one side of the beginning
-- of the cluster this glyph is part of, then the shaping results for
-- the other side might change.
-- Note that the absence of this flag will NOT by itself mean that
-- it IS safe to concat text. Only two pieces of text both of which
-- clear of this flag can be concatenated safely.
-- See https://harfbuzz.github.io/harfbuzz-hb-buffer.html#HB_GLYPH_FLAG_UNSAFE_TO_CONCAT
-- for more details.
safeToInsertTatweel :: Bool
-- ^ In scripts that use elongation (Arabic, Mongolian, Syriac, etc.),
-- this flag signifies that it is safe to insert a U+0640 TATWEEL character
-- before this cluster for elongation.
-- This flag does not determine the script-specific elongation places,
-- but only when it is safe to do the elongation without interrupting text shaping.
} deriving (Show, Read, Eq)
instance Storable GlyphInfo where
sizeOf _ = sizeOf (undefined :: Word32) * 5
alignment _ = alignment (undefined :: Word32)
peek ptr = do
let ptr' :: Ptr Word32
ptr' = castPtr ptr
-- Ignore private fields.
codepoint' <- ptr' `peekElemOff` 0
mask <- ptr' `peekElemOff` 1
cluster' <- ptr' `peekElemOff` 2
return $ GlyphInfo codepoint' cluster' (mask `testBit` 1) (mask `testBit` 2) (mask `testBit` 3)
poke ptr (GlyphInfo codepoint' cluster' flag1 flag2 flag3) = do
-- Zero private fields.
let ptr' :: Ptr Word32
ptr' = castPtr ptr
pokeElemOff ptr' 0 codepoint'
pokeElemOff ptr' 1 $ Prelude.foldl (.|.) 0 [
if flag1 then 1 else 0,
if flag2 then 2 else 0,
if flag3 then 4 else 0
]
pokeElemOff ptr' 2 cluster'
pokeElemOff ptr' 3 0
pokeElemOff ptr' 4 0
-- | Decodes `Buffer'`'s glyph information array.'
glyphInfos buf' = do
arr <- hb_buffer_get_glyph_infos buf' nullPtr
length <- hb_buffer_get_length buf'
if length == 0 || arr == nullPtr
then return []
else forM [0..fromEnum length - 1] $ peekElemOff arr
foreign import ccall "hb_buffer_get_glyph_infos" hb_buffer_get_glyph_infos
:: Buffer' -> Ptr Word -> IO (Ptr GlyphInfo)
foreign import ccall "hb_buffer_get_length" hb_buffer_get_length :: Buffer' -> IO Word
-- NOTE: The array returned from FFI is valid as long as the buffer is.
-- | Holds positions of the glyph in both horizontal & vertical directions.
-- All positions are relative to current point.
data GlyphPos = GlyphPos {
x_advance :: Int32,
-- ^ How much the line advances after drawing this glyph when setting text
-- in horizontal direction.
y_advance :: Int32,
-- ^ How much the line advances after drawing this glyph when setting text
-- in vertical direction.
x_offset :: Int32,
-- ^ How much the glyph moves on the X-axis before drawing it, this should
-- not effect how much the line advances.
y_offset :: Int32
-- ^ How much the glyph moves on the Y-axis before drawing it, this should
-- not effect how much the line advances.
} deriving (Show, Read, Eq, Generic)
instance Storable GlyphPos where
sizeOf _ = sizeOf (undefined :: Int32) * 5
alignment _ = alignment (undefined :: Int32)
peek ptr = let ptr' = castPtr ptr in do
x_advance' <- ptr' `peekElemOff` 0
y_advance' <- ptr' `peekElemOff` 1
x_offset' <- ptr' `peekElemOff` 2
y_offset' <- ptr' `peekElemOff` 3
return $ GlyphPos x_advance' y_advance' x_offset' y_offset'
poke ptr (GlyphPos x_advance' y_advance' x_offset' y_offset') = do
let ptr' :: Ptr Int32
ptr' = castPtr ptr
pokeElemOff ptr' 0 x_advance'
pokeElemOff ptr' 1 y_advance'
pokeElemOff ptr' 2 x_offset'
pokeElemOff ptr' 3 y_offset'
pokeElemOff ptr' 4 0 -- Zero private field.
-- | Decodes `Buffer'`'s glyph position array.
-- If buffer did not have positions before, they will be initialized to zeros.'
glyphsPos buf' = do
arr <- hb_buffer_get_glyph_positions buf' nullPtr
length <- hb_buffer_get_length buf'
if length == 0 || arr == nullPtr
then return []
else forM [0..fromEnum length - 1] $ peekElemOff arr
foreign import ccall "hb_buffer_get_glyph_positions" hb_buffer_get_glyph_positions
:: Buffer' -> Ptr Word -> IO (Ptr GlyphPos)
-- NOTE: The array returned from FFI is valid as long as the buffer is.
-- | Decodes a `Buffer'` back to corresponding pure-functional `Buffer`.
thawBuffer :: Buffer' -> IO Buffer
thawBuffer buf' = do
glyphInfos' <- glyphInfos buf'
contentType' <- hb_buffer_get_content_type buf'
direction' <- hb_buffer_get_direction buf'
script' <- hb_buffer_get_script buf'
language'' <- hb_buffer_get_language buf'
language' <- peekCString language''
flags' <- hb_buffer_get_flags buf'
clusterLevel' <- hb_buffer_get_cluster_level buf'
invisibleGlyph' <- hb_buffer_get_invisible_glyph buf'
replacementCodepoint' <- hb_buffer_get_replacement_codepoint buf'
return defaultBuffer {
text = Lazy.pack $ Prelude.map (w2c . codepoint) glyphInfos',
contentType = case contentType' of
1 -> Just ContentTypeUnicode
2 -> Just ContentTypeGlyphs
_ -> Nothing,
direction = int2dir direction',
script = Just $ tag_to_string script',
language = Just language',
beginsText = testBit flags' 0, endsText = testBit flags' 1,
preserveDefaultIgnorables = testBit flags' 2,
removeDefaultIgnorables = testBit flags' 3,
don'tInsertDottedCircle = testBit flags' 4,
clusterLevel = case clusterLevel' of
1 -> ClusterMonotoneChars
2 -> ClusterChars
_ -> ClusterMonotoneGraphemes,
invisibleGlyph = w2c invisibleGlyph',
replacementCodepoint = w2c replacementCodepoint'
}
foreign import ccall "hb_buffer_get_content_type" hb_buffer_get_content_type
:: Buffer' -> IO Int
foreign import ccall "hb_buffer_get_direction" hb_buffer_get_direction :: Buffer' -> IO Int
foreign import ccall "hb_buffer_get_script" hb_buffer_get_script :: Buffer' -> IO Word32
foreign import ccall "hb_buffer_get_language" hb_buffer_get_language :: Buffer' -> IO CString
foreign import ccall "hb_buffer_get_flags" hb_buffer_get_flags :: Buffer' -> IO Int
foreign import ccall "hb_buffer_get_cluster_level" hb_buffer_get_cluster_level
:: Buffer' -> IO Int
foreign import ccall "hb_buffer_get_invisible_glyph" hb_buffer_get_invisible_glyph
:: Buffer' -> IO Word32
foreign import ccall "hb_buffer_get_replacement_codepoint" hb_buffer_get_replacement_codepoint
:: Buffer' -> IO Word32