From 2462c039c364112d9015b036fe9217368d7bf2a5 Mon Sep 17 00:00:00 2001
From: Jaro <jaro@argonaut-constellation.org>
Date: Sat, 11 Mar 2023 14:33:04 +0100
Subject: [PATCH] Refactor line breaking.

---
 .../Text/ParagraphLayout/Internal/Plain.hs    |  90 ++--------
 .../ParagraphLayout/Internal/TextContainer.hs |  45 ++---
 .../Internal/TextContainerSpec.hs             | 164 +++++++++---------
 3 files changed, 126 insertions(+), 173 deletions(-)

diff --git a/src/Data/Text/ParagraphLayout/Internal/Plain.hs b/src/Data/Text/ParagraphLayout/Internal/Plain.hs
index 86e0cdd..ad09ba1 100644
--- a/src/Data/Text/ParagraphLayout/Internal/Plain.hs
+++ b/src/Data/Text/ParagraphLayout/Internal/Plain.hs
@@ -24,7 +24,7 @@ import Data.Text.Glyphize
     ,fontExtentsForDir
     ,shape
     )
-import Data.Text.ICU (LocaleName(Locale), breakCharacter, breakLine)
+import Data.Text.ICU (Breaker, LocaleName(Locale), breakCharacter, breakLine)
 import qualified Data.Text.ICU as BreakStatus (Line)
 import Data.Text.Internal (Text(Text))
 import qualified Data.Text.Lazy as Lazy
@@ -162,9 +162,9 @@ layoutAndWrapRunsH maxWidth runs = NonEmpty.head $ validLayouts
         layout runs1 = layoutRunsH $ trimTextsEnd isEndSpace runs1
         splits = noSplit :| (lSplits ++ cSplits)
         noSplit = (runs, [])
-        lSplits = filter hasContent $ lineSplits runs
-        cSplits = filter hasContent $ characterSplits runs
-        hasContent = not . null . fst
+        -- TODO: Use Soft/Hard line break distinction.
+        lSplits = splitTextsBy (map fst . runLineBreaks) runs
+        cSplits = splitTextsBy (map fst . runCharacterBreaks) runs
 
 -- | The suffix remaining after removing the longest prefix of the list for
 -- which the predicate holds, except always including at least the last element
@@ -185,42 +185,6 @@ layoutRunsH runs = map layoutRunH runs
 totalAdvances :: [WithSpan PF.ProtoFragment] -> Int32
 totalAdvances pfs = sum $ map (\(WithSpan _ pf) -> PF.advance pf) pfs
 
--- | Recursive function for finding all possible ways to split a list of runs
--- into two on a valid line-breaking boundary, including the start of the first
--- run and excluding the end of the last run.
---
--- The results in the form (prefix, suffix) will be ordered from the longest
--- prefix to shortest.
-lineSplits :: [WithSpan Run] -> [([WithSpan Run], [WithSpan Run])]
-lineSplits xs = lineSplits' [] (reverse xs)
-
-lineSplits' :: [WithSpan Run] -> [WithSpan Run] ->
-    [([WithSpan Run], [WithSpan Run])]
-lineSplits' _ [] = []
-lineSplits' closed (x:xs) = splits ++ lineSplits' (x:closed) xs
-    where
-        splits = map mapFunc $ runLineSplits x
-        mapFunc ((x1, x2), _) =
-            (reverse $ collapse $ x1 :| xs, collapse $ x2 :| closed)
-
--- | Recursive function for finding all possible ways to split a list of runs
--- into two on a character boundary, including the start of the first run and
--- excluding the end of the last run.
---
--- The results in the form (prefix, suffix) will be ordered from the longest
--- prefix to shortest.
-characterSplits :: [WithSpan Run] -> [([WithSpan Run], [WithSpan Run])]
-characterSplits xs = characterSplits' [] (reverse xs)
-
-characterSplits' :: [WithSpan Run] -> [WithSpan Run] ->
-    [([WithSpan Run], [WithSpan Run])]
-characterSplits' _ [] = []
-characterSplits' closed (x:xs) = splits ++ characterSplits' (x:closed) xs
-    where
-        splits = map mapFunc $ runCharacterSplits x
-        mapFunc ((x1, x2), _) =
-            (reverse $ collapse $ x1 :| xs, collapse $ x2 :| closed)
-
 -- | Calculate layout for the given horizontal run and attach extra information.
 layoutRunH :: WithSpan Run -> WithSpan PF.ProtoFragment
 layoutRunH (WithSpan rs run) = WithSpan rs pf
@@ -257,8 +221,8 @@ resolveSpans p@(Paragraph arr pStart spans pOpts) = do
         <*> ZipList sStarts
         <*> ZipList sLengths
     let lang = spanLanguage $ spanOptions s
-    let lBreaks = paragraphLineBreaks p pEnd lang
-    let cBreaks = paragraphCharacterBreaks p pEnd lang
+    let lBreaks = paragraphBreaks breakLine p pEnd lang
+    let cBreaks = paragraphBreaks breakCharacter p pEnd lang
     return RS.ResolvedSpan
         { RS.spanIndex = i
         , RS.spanOffsetInParagraph = sStart - pStart
@@ -271,41 +235,25 @@ resolveSpans p@(Paragraph arr pStart spans pOpts) = do
         , RS.spanCharacterBreaks = subOffsetsDesc (sStart - pStart) cBreaks
         }
 
-paragraphLineBreaks :: Paragraph -> Int -> String -> [(Int, BreakStatus.Line)]
-paragraphLineBreaks (Paragraph arr off _ _) end lang =
-    breaksDesc (breakLine (localeFromLanguage lang)) paragraphText
+paragraphBreaks :: (LocaleName -> Breaker a) -> Paragraph -> Int -> String ->
+    [(Int, a)]
+paragraphBreaks breakFunc (Paragraph arr off _ _) end lang =
+    breaksDesc (breakFunc (localeFromLanguage lang)) paragraphText
     where
         paragraphText = Text arr off (end - off)
 
-paragraphCharacterBreaks :: Paragraph -> Int -> String -> [(Int, ())]
-paragraphCharacterBreaks (Paragraph arr off _ _) end lang =
-    breaksDesc (breakCharacter (localeFromLanguage lang)) paragraphText
-    where
-        paragraphText = Text arr off (end - off)
-
--- | Split the given run at every valid line break position.
-runLineSplits :: WithSpan Run ->
-    [((WithSpan Run, WithSpan Run), BreakStatus.Line)]
-runLineSplits r = map split $ runLineBreaks r
-    where
-        split (i, status) = (splitTextAt8 i r, status)
-
 runLineBreaks :: WithSpan Run -> [(Int, BreakStatus.Line)]
-runLineBreaks (WithSpan rs run) = dropWhile (not . valid) $
-    subOffsetsDesc (runOffsetInSpan run) $ RS.spanLineBreaks rs
-    where
-        valid (off, _) = off < runLength
-        runLength = lengthWord8 $ getText run
-
--- | Split the given run before every character.
-runCharacterSplits :: WithSpan Run -> [((WithSpan Run, WithSpan Run), ())]
-runCharacterSplits r = map split $ runCharacterBreaks r
-    where
-        split (i, ()) = (splitTextAt8 i r, ())
+runLineBreaks (WithSpan rs run) =
+    runBreaksFromSpan run $ RS.spanLineBreaks rs
 
 runCharacterBreaks :: WithSpan Run -> [(Int, ())]
-runCharacterBreaks (WithSpan rs run) = dropWhile (not . valid) $
-    subOffsetsDesc (runOffsetInSpan run) $ RS.spanCharacterBreaks rs
+runCharacterBreaks (WithSpan rs run) =
+    runBreaksFromSpan run $ RS.spanCharacterBreaks rs
+
+-- | Constrain span breaks to a selected run and adjust offsets.
+runBreaksFromSpan :: Run -> [(Int, a)] -> [(Int, a)]
+runBreaksFromSpan run spanBreaks =
+    dropWhile (not . valid) $ subOffsetsDesc (runOffsetInSpan run) spanBreaks
     where
         valid (off, _) = off < runLength
         runLength = lengthWord8 $ getText run
diff --git a/src/Data/Text/ParagraphLayout/Internal/TextContainer.hs b/src/Data/Text/ParagraphLayout/Internal/TextContainer.hs
index 5ae4299..2cc4737 100644
--- a/src/Data/Text/ParagraphLayout/Internal/TextContainer.hs
+++ b/src/Data/Text/ParagraphLayout/Internal/TextContainer.hs
@@ -1,11 +1,10 @@
 module Data.Text.ParagraphLayout.Internal.TextContainer
     (SeparableTextContainer
     ,TextContainer
-    ,collapse
     ,dropWhileEnd
     ,getText
     ,splitTextAt8
-    ,splitTextsAt8
+    ,splitTextsBy
     ,trimTextsEnd
     )
 where
@@ -13,7 +12,7 @@ where
 import Data.List.NonEmpty (NonEmpty((:|)))
 import Data.Text (Text)
 import qualified Data.Text as Text
-import Data.Text.Foreign (dropWord8, lengthWord8, takeWord8)
+import Data.Text.Foreign (dropWord8, takeWord8)
 
 -- | Class of data types containing `Text` that can be accessed.
 class TextContainer a where
@@ -46,31 +45,35 @@ instance SeparableTextContainer Text where
     dropWhileEnd = Text.dropWhileEnd
 
 -- | Treat a list of text containers as a contiguous sequence,
--- and make a split at the given number of `Data.Word.Word8` from the beginning
--- of this sequence.
+-- and find all possible ways to split them into two non-empty lists,
+-- using the given function to find valit split offsets in `Data.Word.Word8`
+-- units from the beginning of each container.
 --
--- If @n@ falls on a container boundary, the total number of output containers
--- will equal the number of input containers; otherwise, it will be one larger.
-splitTextsAt8 :: SeparableTextContainer a => Int -> [a] -> ([a], [a])
-splitTextsAt8 n rs = (pre, post)
+-- The results in the form (prefix, suffix) will be ordered from the longest
+-- prefix to shortest.
+splitTextsBy :: SeparableTextContainer a => (a -> [Int]) -> [a] -> [([a], [a])]
+splitTextsBy breakFunc tcs =
+    filter notEmpty $ splitTextsBy' breakFunc [] (reverse tcs)
     where
-        pre = reverse rpre
-        (rpre, post) = splitTextsAt8' n [] rs
+        notEmpty (prefix, suffix) = not (null prefix || null suffix)
 
-splitTextsAt8' :: SeparableTextContainer a => Int -> [a] -> [a] -> ([a], [a])
-splitTextsAt8' _ rpre [] = (rpre, [])
-splitTextsAt8' n rpre (r:rs)
-    | n <= 0 = (rpre, r:rs)
-    | n >= l = splitTextsAt8' (n - l) (r:rpre) (rs)
-    | otherwise = let (r1, r2) = splitTextAt8 n r in (r1:rpre, r2:rs)
+splitTextsBy' :: SeparableTextContainer a => (a -> [Int]) -> [a] -> [a] ->
+    [([a], [a])]
+splitTextsBy' _ _ [] = []
+splitTextsBy' breakFunc closed (tc:tcs) =
+    fullSplits ++ splitTextsBy' breakFunc (tc:closed) tcs
     where
-        l = lengthWord8 $ getText r
+        fullSplits = map mergeWithRest tcSplits
+        mergeWithRest (x1, x2) =
+            (reverse $ collapse $ x1 :| tcs, collapse $ x2 :| closed)
+        tcSplits = map (\i -> splitTextAt8 i tc) tcBreakOffsets
+        tcBreakOffsets = breakFunc tc
 
 -- | If the first container in the list is empty, remove it.
 collapse :: SeparableTextContainer a => NonEmpty a -> [a]
-collapse (x :| xs)
-    | Text.null (getText x) = xs
-    | otherwise = x:xs
+collapse (tc :| tcs)
+    | Text.null (getText tc) = tcs
+    | otherwise = tc:tcs
 
 -- | Treat a list of text containers as a contiguous sequence,
 -- and remove a suffix of characters that match the given predicate.
diff --git a/test/Data/Text/ParagraphLayout/Internal/TextContainerSpec.hs b/test/Data/Text/ParagraphLayout/Internal/TextContainerSpec.hs
index 6d3fecc..b7a852a 100644
--- a/test/Data/Text/ParagraphLayout/Internal/TextContainerSpec.hs
+++ b/test/Data/Text/ParagraphLayout/Internal/TextContainerSpec.hs
@@ -1,96 +1,98 @@
 module Data.Text.ParagraphLayout.Internal.TextContainerSpec (spec) where
 
 import Data.Text (Text, empty, pack)
-import Data.Text.Glyphize (Direction(..))
+import Data.Text.Foreign (lengthWord8)
 
 import Test.Hspec
-import Data.Text.ParagraphLayout.Internal.Run
 import Data.Text.ParagraphLayout.Internal.TextContainer
 
-isSpace :: Char -> Bool
-isSpace = (==' ')
+data ExampleContainer = Contain { cText :: Text, cOffset :: Int }
+    deriving (Show, Eq)
+
+contain :: String -> Int -> ExampleContainer
+contain s o = Contain (pack s) o
+
+instance TextContainer ExampleContainer where
+    getText = cText
+
+instance SeparableTextContainer ExampleContainer where
+    splitTextAt8 n (Contain t o) = (Contain t1 o1, Contain t2 o2)
+        where
+            (t1, t2) = splitTextAt8 n t
+            o1 = o
+            o2 = o + lengthWord8 t1
+    dropWhileEnd p (Contain t o) = Contain (dropWhileEnd p t) o
 
-inputRuns :: [Run]
-inputRuns =
-    [ Run
-        -- TODO: We might want both parentheses in the same run.
-        { runOffsetInSpan = 0
-        , runText = pack "Vikipedija ("
-        , runDirection = Just DirLTR
-        , runScript = Just "Latn"
-        }
-    , Run
-        { runOffsetInSpan = 12
-        , runText = pack "Википедија)"
-        , runDirection = Just DirLTR
-        , runScript = Just "Cyrl"
-        }
+exampleContainers :: [ExampleContainer]
+exampleContainers = [c1, c2]
+    where
+        (c1, c2) = splitTextAt8 11 $ contain "Vikipedija (Википедија)" 10
+
+exampleBreaks :: [Int]
+exampleBreaks =
+    [
+    -- Out of bounds. Should not generate any splits.
+    999, 50,
+    -- End of last text. Should not generate a split.
+    43,
+    -- Word and syllable bounds in the second text,
+    -- similar to hyphenation rules. Each should generate a corresponding split.
+    38, 34, 30, 26,
+    -- The exact edge between the two texts.
+    -- Should generate a split, but not any empty containers.
+    21,
+    -- Word and syllable bounds in the first text.
+    -- Each should generate a corresponding split.
+    18, 16, 14, 12,
+    -- Start of first text. Should not generate a split.
+    10,
+    -- Out of bounds. Should not generate any splits.
+    5, 0, -1
     ]
 
+exampleBreakPoints :: ExampleContainer -> [Int]
+exampleBreakPoints c =
+    dropWhile (>= l) $ takeWhile (>= 0) $ map (subtract d) $ exampleBreaks
+    where
+        l = lengthWord8 $ cText c
+        d = cOffset c
+
+isSpace :: Char -> Bool
+isSpace = (==' ')
+
 spec :: Spec
 spec = do
-    describe "splitTextsAt8" $ do
-        it "negative value splits at beginning of first run" $ do
-            splitTextsAt8 (-1) inputRuns `shouldBe` ([], inputRuns)
-        it "zero splits at beginning of first run" $ do
-            splitTextsAt8 0 inputRuns `shouldBe` ([], inputRuns)
-        it "splits in first run" $ do
-            splitTextsAt8 11 inputRuns `shouldBe`
-                (
-                    [ Run
-                        { runOffsetInSpan = 0
-                        , runText = pack "Vikipedija "
-                        , runDirection = Just DirLTR
-                        , runScript = Just "Latn"
-                        }
-                    ]
-                ,
-                    [ Run
-                        { runOffsetInSpan = 11
-                        , runText = pack "("
-                        , runDirection = Just DirLTR
-                        , runScript = Just "Latn"
-                        }
-                    , Run
-                        { runOffsetInSpan = 12
-                        , runText = pack "Википедија)"
-                        , runDirection = Just DirLTR
-                        , runScript = Just "Cyrl"
-                        }
-                    ]
-                )
-        it "split at run edges does not generate extra run" $ do
-            splitTextsAt8 12 inputRuns `shouldBe`
-                (take 1 inputRuns, drop 1 inputRuns)
-        it "splits in second run" $ do
-            splitTextsAt8 20 inputRuns `shouldBe`
-                (
-                    [ Run
-                        { runOffsetInSpan = 0
-                        , runText = pack "Vikipedija ("
-                        , runDirection = Just DirLTR
-                        , runScript = Just "Latn"
-                        }
-                    , Run
-                        { runOffsetInSpan = 12
-                        , runText = pack "Вики"
-                        , runDirection = Just DirLTR
-                        , runScript = Just "Cyrl"
-                        }
-                    ]
-                ,
-                    [ Run
-                        { runOffsetInSpan = 20
-                        , runText = pack "педија)"
-                        , runDirection = Just DirLTR
-                        , runScript = Just "Cyrl"
-                        }
-                    ]
-                )
-        it "split at end does not generate extra run" $ do
-            splitTextsAt8 33 inputRuns `shouldBe` (inputRuns, [])
-        it "large value splits at end of last run" $ do
-            splitTextsAt8 999 inputRuns `shouldBe` (inputRuns, [])
+    describe "splitTextsBy" $ do
+        it "splits example text containers" $ do
+            splitTextsBy exampleBreakPoints exampleContainers `shouldBe`
+                [ ( [ contain "Vikipedija " 10, contain "(Википеди" 21 ]
+                  , [ contain "ја)" 38 ]
+                  )
+                , ( [ contain "Vikipedija " 10, contain "(Википе" 21 ]
+                  , [ contain "дија)" 34 ]
+                  )
+                , ( [ contain "Vikipedija " 10, contain "(Вики" 21 ]
+                  , [ contain "педија)" 30 ]
+                  )
+                , ( [ contain "Vikipedija " 10, contain "(Ви" 21 ]
+                  , [ contain "кипедија)" 26 ]
+                  )
+                , ( [ contain "Vikipedija " 10 ]
+                  , [ contain "(Википедија)" 21 ]
+                  )
+                , ( [ contain "Vikipedi" 10 ]
+                  , [ contain "ja " 18, contain "(Википедија)" 21 ]
+                  )
+                , ( [ contain "Vikipe" 10 ]
+                  , [ contain "dija " 16, contain "(Википедија)" 21 ]
+                  )
+                , ( [ contain "Viki" 10 ]
+                  , [ contain "pedija " 14, contain "(Википедија)" 21 ]
+                  )
+                , ( [ contain "Vi" 10 ]
+                  , [ contain "kipedija " 12, contain "(Википедија)" 21 ]
+                  )
+                ]
     describe "trimTextsEnd" $ do
         describe "isSpace" $ do
             it "does nothing on an empty list" $ do
-- 
2.30.2