From cb8b67f9026a8a1237a02bf6f1991d1a36a53649 Mon Sep 17 00:00:00 2001
From: Jaro <jaro@argonaut-constellation.org>
Date: Sat, 11 Mar 2023 01:11:53 +0100
Subject: [PATCH] Allow breaking at character bounds.

---
 .golden/czechHelloParagraphNarrow/golden      | 19 ++++++
 .../Text/ParagraphLayout/Internal/Plain.hs    | 67 ++++++++++++++-----
 .../ParagraphLayout/Internal/ResolvedSpan.hs  |  2 +
 .../ParagraphLayout/Internal/BreakSpec.hs     |  4 +-
 test/Data/Text/ParagraphLayoutSpec.hs         |  7 ++
 5 files changed, 81 insertions(+), 18 deletions(-)
 create mode 100644 .golden/czechHelloParagraphNarrow/golden

diff --git a/.golden/czechHelloParagraphNarrow/golden b/.golden/czechHelloParagraphNarrow/golden
new file mode 100644
index 0000000..b282aa0
--- /dev/null
+++ b/.golden/czechHelloParagraphNarrow/golden
@@ -0,0 +1,19 @@
+ParagraphLayout {paragraphRect = Rect {x_origin = 0, y_origin = 0, x_size = 1234, y_size = -5605}, spanLayouts = [
+    SpanLayout [Fragment {fragmentRect = Rect {x_origin = 0, y_origin = 0, x_size = 1234, y_size = -1121}, fragmentPen = (0,-932), fragmentGlyphs =
+        [(GlyphInfo {codepoint = 36, cluster = 5, unsafeToBreak = False, unsafeToConcat = False, safeToInsertTatweel = False},GlyphPos {x_advance = 663, y_advance = 0, x_offset = 0, y_offset = 0}),
+        (GlyphInfo {codepoint = 75, cluster = 6, unsafeToBreak = False, unsafeToConcat = False, safeToInsertTatweel = False},GlyphPos {x_advance = 571, y_advance = 0, x_offset = 0, y_offset = 0})]
+    }, Fragment {fragmentRect = Rect {x_origin = 0, y_origin = -1121, x_size = 1089, y_size = -1121}, fragmentPen = (0,-932), fragmentGlyphs =
+        [(GlyphInfo {codepoint = 82, cluster = 7, unsafeToBreak = False, unsafeToConcat = False, safeToInsertTatweel = False},GlyphPos {x_advance = 590, y_advance = 0, x_offset = 0, y_offset = 0}),
+        (GlyphInfo {codepoint = 77, cluster = 8, unsafeToBreak = False, unsafeToConcat = False, safeToInsertTatweel = False},GlyphPos {x_advance = 253, y_advance = 0, x_offset = 0, y_offset = 0}),
+        (GlyphInfo {codepoint = 15, cluster = 9, unsafeToBreak = False, unsafeToConcat = False, safeToInsertTatweel = False},GlyphPos {x_advance = 246, y_advance = 0, x_offset = 0, y_offset = 0})]
+    }, Fragment {fragmentRect = Rect {x_origin = 0, y_origin = -2242, x_size = 948, y_size = -1121}, fragmentPen = (0,-932), fragmentGlyphs =
+        [(GlyphInfo {codepoint = 86, cluster = 11, unsafeToBreak = False, unsafeToConcat = False, safeToInsertTatweel = False},GlyphPos {x_advance = 446, y_advance = 0, x_offset = 0, y_offset = 0}),
+        (GlyphInfo {codepoint = 89, cluster = 12, unsafeToBreak = False, unsafeToConcat = False, safeToInsertTatweel = False},GlyphPos {x_advance = 502, y_advance = 0, x_offset = 0, y_offset = 0})]
+    }, Fragment {fragmentRect = Rect {x_origin = 0, y_origin = -3363, x_size = 961, y_size = -1121}, fragmentPen = (0,-932), fragmentGlyphs =
+        [(GlyphInfo {codepoint = 246, cluster = 13, unsafeToBreak = False, unsafeToConcat = False, safeToInsertTatweel = False},GlyphPos {x_advance = 559, y_advance = 0, x_offset = 0, y_offset = 0}),
+        (GlyphInfo {codepoint = 87, cluster = 15, unsafeToBreak = False, unsafeToConcat = False, safeToInsertTatweel = False},GlyphPos {x_advance = 402, y_advance = 0, x_offset = 0, y_offset = 0})]
+    }, Fragment {fragmentRect = Rect {x_origin = 0, y_origin = -4484, x_size = 835, y_size = -1121}, fragmentPen = (0,-932), fragmentGlyphs =
+        [(GlyphInfo {codepoint = 72, cluster = 16, unsafeToBreak = False, unsafeToConcat = False, safeToInsertTatweel = False},GlyphPos {x_advance = 559, y_advance = 0, x_offset = 0, y_offset = 0}),
+        (GlyphInfo {codepoint = 4, cluster = 17, unsafeToBreak = False, unsafeToConcat = False, safeToInsertTatweel = False},GlyphPos {x_advance = 276, y_advance = 0, x_offset = 0, y_offset = 0})]
+    }]
+]}
diff --git a/src/Data/Text/ParagraphLayout/Internal/Plain.hs b/src/Data/Text/ParagraphLayout/Internal/Plain.hs
index 53ce413..86e0cdd 100644
--- a/src/Data/Text/ParagraphLayout/Internal/Plain.hs
+++ b/src/Data/Text/ParagraphLayout/Internal/Plain.hs
@@ -24,7 +24,7 @@ import Data.Text.Glyphize
     ,fontExtentsForDir
     ,shape
     )
-import Data.Text.ICU (LocaleName(Locale), breakLine)
+import Data.Text.ICU (LocaleName(Locale), breakCharacter, breakLine)
 import qualified Data.Text.ICU as BreakStatus (Line)
 import Data.Text.Internal (Text(Text))
 import qualified Data.Text.Lazy as Lazy
@@ -160,10 +160,10 @@ layoutAndWrapRunsH maxWidth runs = NonEmpty.head $ validLayouts
         layouts = NonEmpty.map layoutFst splits
         layoutFst (runs1, runs2) = (layout runs1, runs2)
         layout runs1 = layoutRunsH $ trimTextsEnd isEndSpace runs1
-        splits = noSplit :| (wordSplits ++ [lastResortSplit])
+        splits = noSplit :| (lSplits ++ cSplits)
         noSplit = (runs, [])
-        wordSplits = (filter hasContent $ breakSplits [] (reverse runs))
-        lastResortSplit = splitTextsAt8 1 runs
+        lSplits = filter hasContent $ lineSplits runs
+        cSplits = filter hasContent $ characterSplits runs
         hasContent = not . null . fst
 
 -- | The suffix remaining after removing the longest prefix of the list for
@@ -189,24 +189,38 @@ totalAdvances pfs = sum $ map (\(WithSpan _ pf) -> PF.advance pf) pfs
 -- into two on a valid line-breaking boundary, including the start of the first
 -- run and excluding the end of the last run.
 --
--- The first input list is the suffix consisting of runs that have already been
--- considered for breaking. These will be appended to the output suffix as they
--- are.
---
--- The second input list is the prefix consisting of runs to be considered for
--- breaking, in reverse order.
---
 -- The results in the form (prefix, suffix) will be ordered from the longest
 -- prefix to shortest.
-breakSplits :: [WithSpan Run] -> [WithSpan Run] ->
+lineSplits :: [WithSpan Run] -> [([WithSpan Run], [WithSpan Run])]
+lineSplits xs = lineSplits' [] (reverse xs)
+
+lineSplits' :: [WithSpan Run] -> [WithSpan Run] ->
     [([WithSpan Run], [WithSpan Run])]
-breakSplits _ [] = []
-breakSplits closed (x:xs) = splits ++ breakSplits (x:closed) xs
+lineSplits' _ [] = []
+lineSplits' closed (x:xs) = splits ++ lineSplits' (x:closed) xs
     where
         splits = map mapFunc $ runLineSplits x
         mapFunc ((x1, x2), _) =
             (reverse $ collapse $ x1 :| xs, collapse $ x2 :| closed)
 
+-- | Enumerate every way to split a list of runs into two on a character
+-- boundary, including the start of the first run and excluding the end of
+-- the last run. The work is done by the recursive helper @characterSplits'@.
+--
+-- Each result has the form (prefix, suffix); results are ordered from the
+-- longest prefix to the shortest.
+characterSplits :: [WithSpan Run] -> [([WithSpan Run], [WithSpan Run])]
+characterSplits = characterSplits' [] . reverse
+
+characterSplits' :: [WithSpan Run] -> [WithSpan Run] ->
+    [([WithSpan Run], [WithSpan Run])]
+characterSplits' _ [] = []
+characterSplits' done (run:todo) =
+    map rejoin (runCharacterSplits run) ++ characterSplits' (run:done) todo
+    where
+        rejoin ((before, after), _) =
+            (reverse $ collapse $ before :| todo, collapse $ after :| done)
+
 -- | Calculate layout for the given horizontal run and attach extra information.
 layoutRunH :: WithSpan Run -> WithSpan PF.ProtoFragment
 layoutRunH (WithSpan rs run) = WithSpan rs pf
@@ -243,7 +257,8 @@ resolveSpans p@(Paragraph arr pStart spans pOpts) = do
         <*> ZipList sStarts
         <*> ZipList sLengths
     let lang = spanLanguage $ spanOptions s
-    let breaks = paragraphLineBreaks p pEnd lang
+    let lBreaks = paragraphLineBreaks p pEnd lang
+    let cBreaks = paragraphCharacterBreaks p pEnd lang
     return RS.ResolvedSpan
         { RS.spanIndex = i
         , RS.spanOffsetInParagraph = sStart - pStart
@@ -252,7 +267,8 @@ resolveSpans p@(Paragraph arr pStart spans pOpts) = do
         , RS.spanFont = paragraphFont pOpts
         , RS.spanLineHeight = paragraphLineHeight pOpts
         , RS.spanLanguage = lang
-        , RS.spanLineBreaks = subOffsetsDesc (sStart - pStart) breaks
+        , RS.spanLineBreaks = subOffsetsDesc (sStart - pStart) lBreaks
+        , RS.spanCharacterBreaks = subOffsetsDesc (sStart - pStart) cBreaks
         }
 
 paragraphLineBreaks :: Paragraph -> Int -> String -> [(Int, BreakStatus.Line)]
@@ -261,6 +277,12 @@ paragraphLineBreaks (Paragraph arr off _ _) end lang =
     where
         paragraphText = Text arr off (end - off)
 
+-- | Character breaks of the paragraph text ending at the given array offset.
+paragraphCharacterBreaks :: Paragraph -> Int -> String -> [(Int, ())]
+paragraphCharacterBreaks (Paragraph arr off _ _) end lang =
+    breaksDesc charBreaker (Text arr off (end - off))
+    where charBreaker = breakCharacter (localeFromLanguage lang)
+
 -- | Split the given run at every valid line break position.
 runLineSplits :: WithSpan Run ->
     [((WithSpan Run, WithSpan Run), BreakStatus.Line)]
@@ -275,6 +297,19 @@ runLineBreaks (WithSpan rs run) = dropWhile (not . valid) $
         valid (off, _) = off < runLength
         runLength = lengthWord8 $ getText run
 
+-- | Split the given run at each offset produced by 'runCharacterBreaks'.
+runCharacterSplits :: WithSpan Run -> [((WithSpan Run, WithSpan Run), ())]
+runCharacterSplits run = map cutAt (runCharacterBreaks run)
+    where
+        cutAt (offset, ()) = (splitTextAt8 offset run, ())
+
+-- | Offsets into the given run at which it can be split between characters.
+runCharacterBreaks :: WithSpan Run -> [(Int, ())]
+runCharacterBreaks (WithSpan rs run) = dropWhile tooFar $
+    subOffsetsDesc (runOffsetInSpan run) $ RS.spanCharacterBreaks rs
+    where
+        tooFar (off, _) = off >= lengthWord8 (getText run)
+
 -- | Predicate for characters that can be potentially removed from the end of
 -- a line according to the CSS Text Module.
 isEndSpace :: Char -> Bool
diff --git a/src/Data/Text/ParagraphLayout/Internal/ResolvedSpan.hs b/src/Data/Text/ParagraphLayout/Internal/ResolvedSpan.hs
index 26e3f1d..48531a8 100644
--- a/src/Data/Text/ParagraphLayout/Internal/ResolvedSpan.hs
+++ b/src/Data/Text/ParagraphLayout/Internal/ResolvedSpan.hs
@@ -22,6 +22,8 @@ data ResolvedSpan = ResolvedSpan
     , spanLineHeight :: LineHeight
     , spanLanguage :: String
     , spanLineBreaks :: [(Int, BreakStatus.Line)]
+    -- TODO: Can be optimised by starting with the shortest line break.
+    , spanCharacterBreaks :: [(Int, ())]
     }
     deriving (Show)
 
diff --git a/test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs b/test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs
index d37289b..544a03f 100644
--- a/test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs
+++ b/test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs
@@ -98,8 +98,8 @@ spec = do
                     ,(0, BreakStatus.Uncategorized)
                     ]
 
-        -- Probably not useful for Balkón;
-        -- HarfBuzz takes care of identifying character clusters for us.
+        -- Useful for breaking inside words for narrow output.
+        -- This can result in breaking ligatures.
         describe "breakCharacter" $ do
             let b lang = breaksDesc $ breakCharacter (Locale lang)
 
diff --git a/test/Data/Text/ParagraphLayoutSpec.hs b/test/Data/Text/ParagraphLayoutSpec.hs
index 2ef5a24..3d36034 100644
--- a/test/Data/Text/ParagraphLayoutSpec.hs
+++ b/test/Data/Text/ParagraphLayoutSpec.hs
@@ -134,6 +134,13 @@ spec = do
                         (opts font)
                         { paragraphLineHeight = Absolute 599 }
                 result `shouldBeGolden` "lineHeightSmaller"
+            it "wraps mid-word when line is narrow" $ \font -> do
+                let
+                    result = layoutPlain $ czechHelloParagraph $
+                        (opts font)
+                        { paragraphMaxWidth = 1300 }
+                result `shouldBeGolden` "czechHelloParagraphNarrow"
+                -- TODO test breaking ligatures
             it "wraps by characters when line is ultra narrow" $ \font -> do
                 let
                     result = layoutPlain $ czechHelloParagraph $
-- 
2.30.2