From 92075f52e26577a9ba3c18ee0b8911214137021b Mon Sep 17 00:00:00 2001 From: Jaro Date: Sun, 11 Jun 2023 21:14:19 +0200 Subject: [PATCH] Internally allow splitting texts at endpoints. This is required in order to preserve a hard line break at the end of the input. --- .../Text/ParagraphLayout/Internal/Layout.hs | 11 +- .../ParagraphLayout/Internal/TextContainer.hs | 8 +- .../Internal/TextContainerSpec.hs | 107 +++++++++++++----- 3 files changed, 89 insertions(+), 37 deletions(-) diff --git a/src/Data/Text/ParagraphLayout/Internal/Layout.hs b/src/Data/Text/ParagraphLayout/Internal/Layout.hs index a1e5674..10b0e4b 100644 --- a/src/Data/Text/ParagraphLayout/Internal/Layout.hs +++ b/src/Data/Text/ParagraphLayout/Internal/Layout.hs @@ -259,9 +259,10 @@ hardSplit runs = allowFstEmpty $ trimFst $ NonEmpty.last $ splits -- TODO: Consider optimising. -- We do not need to look for any line breaks further than the -- shortest hard break. - splits = noSplit :| map allowSndEmpty hSplits + splits = noSplit :| hSplits noSplit = (runs, []) - hSplits = -- from longest to shortest + hSplits = nonEmptyFsts $ + -- from longest to shortest splitTextsBy (map fst . filter isHard . runLineBreaks) runs isHard (_, status) = status == BreakStatus.Hard @@ -284,11 +285,13 @@ softSplits runs = map (allowSndEmpty . trimFst) splits trimFst (runs1, runs2) = (trim runs1, runs2) trim = trimTextsStart isStartSpace . trimTextsEnd isEndSpace splits = lSplits ++ cSplits - lSplits = splitTextsBy (map fst . runLineBreaks) runs + lSplits = nonEmptyPairs $ + splitTextsBy (map fst . runLineBreaks) runs -- TODO: Consider optimising. -- We do not need to look for character breaks further than the -- shortest line break. - cSplits = splitTextsBy (map fst . runCharacterBreaks) runs + cSplits = nonEmptyPairs $ + splitTextsBy (map fst . runCharacterBreaks) runs -- | The suffix remaining after removing the longest prefix of the list for -- which the predicate holds, except always including at least the last element diff --git a/src/Data/Text/ParagraphLayout/Internal/TextContainer.hs b/src/Data/Text/ParagraphLayout/Internal/TextContainer.hs index 42e5827..f926c9e 100644 --- a/src/Data/Text/ParagraphLayout/Internal/TextContainer.hs +++ b/src/Data/Text/ParagraphLayout/Internal/TextContainer.hs @@ -20,8 +20,6 @@ import Data.Text (Text) import qualified Data.Text as Text import Data.Text.Foreign (dropWord8, takeWord8) -import Data.Text.ParagraphLayout.Internal.SplitList - -- | Class of data types containing `Text` that can be accessed. class TextContainer a where -- | Extract a `Text` from its container. @@ -58,16 +56,16 @@ instance SeparableTextContainer Text where dropWhileEnd = Text.dropWhileEnd -- | Treat a list of text containers as a contiguous sequence, --- and find all possible ways to split them into two non-empty lists, +-- and find all possible ways to split them into two lists, -- using the given function to find valid split offsets in `Data.Word.Word8` -- units from the beginning of each container. -- -- The results in the form (prefix, suffix) will be ordered from the longest -- prefix to shortest. splitTextsBy :: (SeparableTextContainer a, Foldable f) => - (a -> [Int]) -> f a -> [(NonEmpty a, NonEmpty a)] + (a -> [Int]) -> f a -> [([a], [a])] splitTextsBy breakFunc tcs = - nonEmptyPairs $ splitTextsBy' breakFunc [] $ reverse $ toList tcs + splitTextsBy' breakFunc [] $ reverse $ toList tcs splitTextsBy' :: SeparableTextContainer a => (a -> [Int]) -> [a] -> [a] -> [([a], [a])] diff --git a/test/Data/Text/ParagraphLayout/Internal/TextContainerSpec.hs b/test/Data/Text/ParagraphLayout/Internal/TextContainerSpec.hs index a9555db..393c0ad 100644 --- a/test/Data/Text/ParagraphLayout/Internal/TextContainerSpec.hs +++ b/test/Data/Text/ParagraphLayout/Internal/TextContainerSpec.hs @@ -1,15 +1,11 @@ module Data.Text.ParagraphLayout.Internal.TextContainerSpec (spec) where -import qualified Data.List.NonEmpty import Data.Text (Text, empty, pack) import Data.Text.Foreign (lengthWord8) import Test.Hspec import Data.Text.ParagraphLayout.Internal.TextContainer -ne :: [a] -> Data.List.NonEmpty.NonEmpty a -ne = Data.List.NonEmpty.fromList - data ExampleContainer = Contain { cText :: Text, cOffset :: Int } deriving (Show, Eq) @@ -43,7 +39,8 @@ exampleBreaks = [ -- Out of bounds. Should not generate any splits. 999, 50, - -- End of last text. Should not generate a split. + -- End of last text. + -- Should only generate a split for end-biased breaks. 43, -- Word and syllable bounds in the second text, -- similar to hyphenation rules. Each should generate a corresponding split. @@ -54,53 +51,107 @@ exampleBreaks = -- Word and syllable bounds in the first text. -- Each should generate a corresponding split. 18, 16, 14, 12, - -- Start of first text. Should not generate a split. + -- Start of first text. + -- Should only generate a split for start-biased breaks. 10, -- Out of bounds. Should not generate any splits. 5, 0, -1 ] -exampleBreakPoints :: ExampleContainer -> [Int] -exampleBreakPoints c = +-- | Bound `exampleBreaks` to the given text container and adjust offsets, +-- including start of text but excluding end of text. +-- +-- Container boundaries should therefore only generate one split, +-- and all splits should have a non-empty prefix. +startBiasedBreakPoints :: ExampleContainer -> [Int] +startBiasedBreakPoints c = dropWhile (>= l) $ takeWhile (>= 0) $ map (subtract d) $ exampleBreaks where l = lengthWord8 $ cText c d = cOffset c +-- | Bound `exampleBreaks` to the given text container and adjust offsets, +-- excluding start of text but including end of text. +-- +-- Container boundaries should therefore only generate one split, +-- and all splits should have a non-empty prefix. +endBiasedBreakPoints :: ExampleContainer -> [Int] +endBiasedBreakPoints c = + dropWhile (> l) $ takeWhile (> 0) $ map (subtract d) $ exampleBreaks + where + l = lengthWord8 $ cText c + d = cOffset c + isSpace :: Char -> Bool isSpace = (== ' ') spec :: Spec spec = do describe "splitTextsBy" $ do - it "splits example text containers" $ do - splitTextsBy exampleBreakPoints exampleContainers `shouldBe` - [ ( ne [ contain "Vikipedija " 10, contain "(Википеди" 21 ] - , ne [ contain "ја)" 38 ] + it "splits example text containers (start bias)" $ do + splitTextsBy startBiasedBreakPoints exampleContainers `shouldBe` + [ ( [ contain "Vikipedija " 10, contain "(Википеди" 21 ] + , [ contain "ја)" 38 ] + ) + , ( [ contain "Vikipedija " 10, contain "(Википе" 21 ] + , [ contain "дија)" 34 ] + ) + , ( [ contain "Vikipedija " 10, contain "(Вики" 21 ] + , [ contain "педија)" 30 ] + ) + , ( [ contain "Vikipedija " 10, contain "(Ви" 21 ] + , [ contain "кипедија)" 26 ] + ) + , ( [ contain "Vikipedija " 10 ] + , [ contain "(Википедија)" 21 ] + ) + , ( [ contain "Vikipedi" 10 ] + , [ contain "ja " 18, contain "(Википедија)" 21 ] + ) + , ( [ contain "Vikipe" 10 ] + , [ contain "dija " 16, contain "(Википедија)" 21 ] + ) + , ( [ contain "Viki" 10 ] + , [ contain "pedija " 14, contain "(Википедија)" 21 ] + ) + , ( [ contain "Vi" 10 ] + , [ contain "kipedija " 12, contain "(Википедија)" 21 ] + ) + , ( [] + , [ contain "Vikipedija " 10, contain "(Википедија)" 21 ] + ) + ] + it "splits example text containers (end bias)" $ do + splitTextsBy endBiasedBreakPoints exampleContainers `shouldBe` + [ ( [ contain "Vikipedija " 10, contain "(Википедија)" 21 ] + , [] + ) + , ( [ contain "Vikipedija " 10, contain "(Википеди" 21 ] + , [ contain "ја)" 38 ] ) - , ( ne [ contain "Vikipedija " 10, contain "(Википе" 21 ] - , ne [ contain "дија)" 34 ] + , ( [ contain "Vikipedija " 10, contain "(Википе" 21 ] + , [ contain "дија)" 34 ] ) - , ( ne [ contain "Vikipedija " 10, contain "(Вики" 21 ] - , ne [ contain "педија)" 30 ] + , ( [ contain "Vikipedija " 10, contain "(Вики" 21 ] + , [ contain "педија)" 30 ] ) - , ( ne [ contain "Vikipedija " 10, contain "(Ви" 21 ] - , ne [ contain "кипедија)" 26 ] + , ( [ contain "Vikipedija " 10, contain "(Ви" 21 ] + , [ contain "кипедија)" 26 ] ) - , ( ne [ contain "Vikipedija " 10 ] - , ne [ contain "(Википедија)" 21 ] + , ( [ contain "Vikipedija " 10 ] + , [ contain "(Википедија)" 21 ] ) - , ( ne [ contain "Vikipedi" 10 ] - , ne [ contain "ja " 18, contain "(Википедија)" 21 ] + , ( [ contain "Vikipedi" 10 ] + , [ contain "ja " 18, contain "(Википедија)" 21 ] ) - , ( ne [ contain "Vikipe" 10 ] - , ne [ contain "dija " 16, contain "(Википедија)" 21 ] + , ( [ contain "Vikipe" 10 ] + , [ contain "dija " 16, contain "(Википедија)" 21 ] ) - , ( ne [ contain "Viki" 10 ] - , ne [ contain "pedija " 14, contain "(Википедија)" 21 ] + , ( [ contain "Viki" 10 ] + , [ contain "pedija " 14, contain "(Википедија)" 21 ] ) - , ( ne [ contain "Vi" 10 ] - , ne [ contain "kipedija " 12, contain "(Википедија)" 21 ] + , ( [ contain "Vi" 10 ] + , [ contain "kipedija " 12, contain "(Википедија)" 21 ] ) ] describe "trimTextsEnd" $ do -- 2.30.2