-- ~jaro/balkon: test/Data/Text/ParagraphLayout/Internal/BreakSpec.hs

module Data.Text.ParagraphLayout.Internal.BreakSpec (spec) where

import Control.Monad (forM_)
import Data.Text (empty, pack, singleton)
import Data.Text.ICU
    ( LocaleName (Locale)
    , breakCharacter
    , breakLine
    , breakSentence
    , breakWord
    )
import qualified Data.Text.ICU as BreakStatus (Line (..), Word (..))

import Test.Hspec
import Data.Text.ParagraphLayout.Internal.Break

-- | Hspec examples for 'breaksDesc' (driven by each of the ICU breakers
-- imported by this module) and for 'subOffsetsDesc'.
--
-- Every expected list is written with the highest offset first.
-- The offsets are not character counts: the expectations advance by 3 for
-- each Japanese character and by 2 for an accented Czech letter
-- (presumably UTF-8 code units).
spec :: Spec
spec = do

    describe "breaksDesc" $ do

        -- Line breaking is one of the crucial building blocks
        -- of a text layout engine.
        describe "breakLine" $ do

            it "finds no breaks in empty input" $
                lineBreaksIn "en" empty `shouldBe` []

            it "finds break at offset 0 in non-empty input" $
                lineBreaksIn "en" (singleton 'a') `shouldBe` [softAt 0]

            it "finds hard break after newline" $
                lineBreaksIn "en" (pack "hello\nworld")
                    `shouldBe` [hardAt 6, softAt 0]

            it "finds hard break after each of newlines" $
                lineBreaksIn "en" (pack "hello\n\nworld")
                    `shouldBe` [hardAt 7, hardAt 6, softAt 0]

            it "finds soft breaks after spaces and tabs" $
                lineBreaksIn "en" (pack "a few\twords")
                    `shouldBe` map softAt [6, 2, 0]

            it "finds soft breaks after each run of whitespace" $
                lineBreaksIn "en" (pack " a  few\t more \n words\n")
                    `shouldBe`
                        [ softAt 16
                        , hardAt 15
                        , softAt 9
                        , softAt 4
                        , softAt 1
                        , softAt 0
                        ]

            it "finds soft breaks after spaces and hyphens" $
                lineBreaksIn "cs" (pack "následuje stanice Frýdek-Místek")
                    `shouldBe` map softAt [27, 19, 11, 0]

            it "finds soft breaks in Japanese kana" $
                lineBreaksIn "ja" (pack "トイレはどこですか？")
                    `shouldBe` map softAt [24, 21, 18, 15, 12, 9, 6, 3, 0]

            -- One Japanese sample whose break opportunities depend on
            -- the strictness selected by the locale.
            let sample = pack "五ヶ月‡コード"
            let strictBreaks = map softAt [18, 12, 9, 6, 0]
            let looseBreaks = map softAt [18, 15, 12, 9, 6, 3, 0]

            -- The grouping of locales below records observed behaviour.
            -- It is unclear why Chinese rules are stricter for Japanese text.
            -- Future versions of ICU may behave differently.
            let strictLocales =
                    [ ""
                    , "en"
                    , "ja@lb=strict"
                    , "zh"
                    , "zh_Hans"
                    , "zh_Hant"
                    , "zxx"
                    , "zxx-any-invalid-suffix"
                    ]
            let looseLocales =
                    [ "@lb=loose"
                    , "en@lb=loose"
                    , "ja"
                    , "ja_JP"
                    , "ja-JP"
                    , "ja-any-invalid-suffix"
                    , "zh@lb=loose"
                    , "zxx-any-invalid-suffix@lb=loose"
                    ]

            -- One example is generated per locale, strict group first.
            forM_ strictLocales $ \ locale ->
                it ("uses strict line breaks for " ++ locale ++ " locale") $
                    lineBreaksIn locale sample `shouldBe` strictBreaks

            forM_ looseLocales $ \ locale ->
                it ("uses loose line breaks for " ++ locale ++ " locale") $
                    lineBreaksIn locale sample `shouldBe` looseBreaks

        -- Sentence breaking is probably of no use to a web browser
        -- rendering engine.
        describe "breakSentence" $ do

            it "finds no breaks in empty input" $
                sentenceBreaksIn "en" empty `shouldBe` []

            it "finds break at offset 0 in non-empty input" $
                sentenceBreaksIn "en" (singleton 'a') `shouldBe` [(0, ())]

        -- Word breaking is probably of no use to a web browser rendering
        -- engine either, but text search and selection may rely on it.
        describe "breakWord" $ do

            it "finds no breaks in empty input" $
                wordBreaksIn "en" empty `shouldBe` []

            it "finds break at offset 0 in non-empty input" $
                wordBreaksIn "en" (singleton 'a')
                    `shouldBe` [uncategorizedAt 0]

            it "finds breaks after runs of letters and spaces" $
                wordBreaksIn "en" (pack "a few   words")
                    `shouldBe`
                        [ uncategorizedAt 8
                        , letterAt 5
                        , uncategorizedAt 2
                        , letterAt 1
                        , uncategorizedAt 0
                        ]

        -- Character breaking helps with breaking inside words when the
        -- output is narrow. Note that this can break ligatures apart.
        describe "breakCharacter" $ do

            it "finds no breaks in empty input" $
                characterBreaksIn "en" empty `shouldBe` []

            it "finds break at offset 0 in non-empty input" $
                characterBreaksIn "en" (singleton 'a') `shouldBe` [(0, ())]

    describe "subOffsetsDesc" $ do

        -- The examples below expect the trailing (2, 'd') entry to be gone
        -- and the remaining offsets to be lowered by 5.
        let shifted =
                subOffsetsDesc 5 [(11, 'a'), (8, 'b'), (5, 'c'), (2, 'd')]

        it "should reduce offsets" $
            (fst <$> shifted) `shouldBe` [6, 3, 0]

        it "should preserve payload" $
            (snd <$> shifted) `shouldBe` ['a', 'b', 'c']

  where
    -- Each helper runs 'breaksDesc' with one ICU breaker, configured for
    -- the locale named by @lang@.
    lineBreaksIn lang = breaksDesc $ breakLine (Locale lang)
    sentenceBreaksIn lang = breaksDesc $ breakSentence (Locale lang)
    wordBreaksIn lang = breaksDesc $ breakWord (Locale lang)
    characterBreaksIn lang = breaksDesc $ breakCharacter (Locale lang)

    -- Shorthand for expected (offset, status) pairs.
    softAt offset = (offset, BreakStatus.Soft)
    hardAt offset = (offset, BreakStatus.Hard)
    letterAt offset = (offset, BreakStatus.Letter)
    uncategorizedAt offset = (offset, BreakStatus.Uncategorized)