From eddbb26ad34055ed8547b39bab45d19d8de173c1 Mon Sep 17 00:00:00 2001 From: Adrian Cochrane Date: Sun, 21 Jul 2019 18:59:35 +1200 Subject: [PATCH] Output SSML that TTS software like eSpeak can read. --- src/Main.hs | 150 +++++++++++++++++++++++++------------------ src/StyleTree.hs | 163 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 250 insertions(+), 63 deletions(-) create mode 100644 src/StyleTree.hs diff --git a/src/Main.hs b/src/Main.hs index 0be4ef4..c62aea0 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -2,6 +2,7 @@ module Main where import System.Environment +import Data.Char (isSpace) import qualified Network.HTTP.Client as HTTP import Network.HTTP.Client.Internal @@ -11,7 +12,8 @@ import qualified Data.ByteString.Lazy.Char8 as C8 import qualified Text.HTML.DOM as HTML import qualified Text.XML as XML -import Data.Text as Txt (pack, unpack, Text(..), append) +import Data.Text as Txt (pack, unpack, Text(..), append, + words, unwords, head, last, stripStart, stripEnd) import qualified Data.CSS.Syntax.StyleSheet as CSS import qualified Data.CSS.Style as Style @@ -23,6 +25,7 @@ import qualified Data.Map as M import Data.Scientific (toRealFloat) import DefaultCSS +import StyleTree main :: IO () main = do @@ -33,19 +36,14 @@ main = do response <- HTTP.httpLbs request manager let html = XML.documentRoot $ HTML.parseLBS $ HTTP.responseBody response style <- retreiveStyles html manager request - C8.putStrLn $ renderElLBS $ stylizeEl style html - -renderElLBS el = XML.renderLBS XML.def $ XML.Document { - XML.documentPrologue = XML.Prologue [] Nothing [], - XML.documentRoot = el, - XML.documentEpilogue = [] - } + let transcript = stylize style html + C8.putStrLn $ renderElLBS $ styleToSSML transcript retreiveStyles html manager base = do style <- H2C.externalStyles authorStyle testMedia html loadURL return style where - emptyStyle :: Style.QueryableStyleSheet MapPropertyParser + emptyStyle :: Style.QueryableStyleSheet StyleTree emptyStyle = Style.queryableStyleSheet agentStyle = H2C.cssPriorityAgent emptyStyle `CSS.parse` Txt.pack userAgentCSS authorStyle = H2C.internalStyles testMedia agentStyle html @@ -59,58 +57,84 @@ testMedia attrs = media == Nothing || media == Just "speech" where media = "media" `M.lookup` attrs -stylizeEl styles html - | XML.NodeElement el <- H2C.traverseStyles buildEl buildText styles html = el - |otherwise = XML.Element name M.empty [] +stylize styles html = H2C.traverseStyles buildNode buildText styles html where - buildEl (MPP attrs) children = XML.NodeElement $ XML.Element name attrs children - buildText _ txt = XML.NodeContent txt - name = XML.Name "prosody" Nothing Nothing - -data MapPropertyParser = MPP (M.Map XML.Name Text) -addAttr self name value = Just $ MPP $ M.insert (XML.Name name Nothing Nothing) value self -lowerHz n "khz" absolute = lowerHz (n*1000) "hz" absolute --- Relative offsets are communicated in SSML by the presence of a sign character. -lowerHz n "hz" True | n < 0 = "0hz" - | otherwise = Txt.pack (show n ++ "hz") -lowerHz n "hz" False | n < 0 = Txt.pack (show n ++ "hz") - | otherwise = Txt.pack ('+' : show n ++ "hz") - -cssFloat :: CSSTok.NumericValue -> Float -cssFloat (CSSTok.NVInteger i) = fromInteger i -cssFloat (CSSTok.NVNumber n) = toRealFloat n - -instance Style.PropertyParser MapPropertyParser where - temp = MPP M.empty - inherit _ = Style.temp - - longhand _ (MPP self) "voice-volume" [CSSTok.Ident kw] -- TODO handle offsets - | kw `elem` ["silent", "x-soft", "soft", "medium", "loud", "x-loud"] = addAttr self "volume" kw - longhand _ (MPP self) "voice-volume" [CSSTok.Ident "initial"] = addAttr self "volume" "medium" - - longhand _ (MPP self) "voice-rate" [CSSTok.Ident kw] -- TODO handle percentages - | kw `elem` ["x-slow", "slow", "medium", "fast", "x-fast"] = addAttr self "rate" kw - | kw `elem` ["initial", "normal"] = addAttr self "rate" "default" - - longhand _ (MPP self) "voice-pitch" [CSSTok.Ident kw] -- TODO handle offsets - | kw `elem` ["x-low", "low", "medium", "high", "x-high"] = addAttr self "pitch" kw - longhand _ (MPP self) "voice-pitch" [CSSTok.Ident "initial"] = addAttr self "pitch" "medium" - longhand _ (MPP self) "voice-pitch" [CSSTok.Dimension _ n unit, CSSTok.Ident "absolute"] = - addAttr self "pitch" $ lowerHz (cssFloat n) unit True - longhand _ (MPP self) "voice-pitch" [CSSTok.Ident "absolute", CSSTok.Dimension _ n unit] = - addAttr self "pitch" $ lowerHz (cssFloat n) unit True - longhand _ (MPP self) "voice-pitch" [CSSTok.Dimension _ n unit] = - addAttr self "pitch" $ lowerHz (cssFloat n) unit False - - longhand _ (MPP self) "voice-range" [CSSTok.Ident kw] -- TODO handle offsets - | kw `elem` ["x-low", "low", "medium", "high", "x-high"] = addAttr self "range" kw - longhand _ (MPP self) "voice-range" [CSSTok.Ident "initial"] = addAttr self "range" "medium" - longhand _ (MPP self) "voice-range" [CSSTok.Dimension _ n unit, CSSTok.Ident "absolute"] = - addAttr self "range" $ lowerHz (cssFloat n) unit True - longhand _ (MPP self) "voice-range" [CSSTok.Ident "absolute", CSSTok.Dimension _ n unit] = - addAttr self "range" $ lowerHz (cssFloat n) unit True - longhand _ (MPP self) "voice-range" [CSSTok.Dimension _ n unit] = - addAttr self "range" $ lowerHz (cssFloat n) unit False - - longhand _ self _ [CSSTok.Ident "inherit"] = Just self -- Imply the inheritance - longhand _ _ _ _ = Nothing + buildNode self children = self {children = children} + buildText _ txt = Style.temp {content = txt} + +styleToSSML StyleTree {speak = False} = + XML.Element (XML.Name "blank" Nothing Nothing) M.empty [] +styleToSSML self = buildEl0 "prosody" [ + ("volume", volume self), + ("rate", rate self), + ("pitch", serializePitch $ pitch self), + ("range", serializePitch $ range self) + ] $ buildEl0 "emphasis" [("level", stress self)] $ + buildEl0 "say-as" [("interpret-as", speakAs self)] $ + buildEl0 "tts:style" [ + ("field", maybeBool (punctuation self) "punctuation" "punctuation"), + ("mode", maybeBool (punctuation self) "all" "none") + ] $ buildVoices (voice self) ( + XML.NodeContent (content self) : + map (\node -> XML.NodeElement $ styleToSSML node) (children self) + ) + +serializePitch Inherit = Nothing +serializePitch (Pitch kw) = Just kw +serializePitch (Absolute "khz" n) = serializePitch $ Absolute "hz" (1000 * n) +serializePitch (Absolute _ n) | n < 0 = Nothing + | otherwise = Just $ Txt.pack (show n ++ "hz") +serializePitch (Relative "khz" n) = serializePitch $ Relative "hz" (1000 * n) +serializePitch (Relative _ n) | n < 0 = Just $ Txt.pack (show n ++ "hz") + | otherwise = Just $ Txt.pack ("+" ++ show n ++ "hz") + +maybeBool (Just True) truthy _ = Just truthy +maybeBool (Just False) _ falsy = Just falsy +maybeBool Nothing _ _ = Nothing + +maybeCast (Just n) = Just $ Txt.pack $ show n +maybeCast Nothing = Nothing + +buildVoice (Voice name) children = buildEl "voice" [("name", Just name)] children +buildVoice (VoicePattern age gender variant) children = buildEl "voice" [ + ("age", maybeCast age), + ("gender", Just gender), + ("variant", maybeCast variant) + ] children + +buildVoices Nothing = buildEl "blank" [] +buildVoices (Just voice) = buildVoice voice + +-------- +---- XML Output Utils +-------- +buildEl name attrs elChildren = + XML.Element (XML.Name name Nothing Nothing) (attrsFromList attrs) elChildren +buildEl0 name attrs child = buildEl name attrs [XML.NodeElement child] + +attrsFromList ((name, Just value):attrs) = + M.insert (XML.Name name Nothing Nothing) value $ attrsFromList attrs +attrsFromList ((_, Nothing):attrs) = attrsFromList attrs +attrsFromList [] = M.empty + +stripEmptyEls (XML.Element name attrs elChildren) = + XML.Element name attrs $ stripEmptyNodes elChildren +stripEmptyNodes (XML.NodeElement el@(XML.Element _ attrs elChildren ):nodes) + | M.null attrs = stripEmptyNodes (elChildren ++ nodes) + | otherwise = XML.NodeElement (stripEmptyEls el) : stripEmptyNodes nodes +stripEmptyNodes (XML.NodeContent txt : nodes) = -- strip whitespace + XML.NodeContent (collapseSpaces txt) : stripEmptyNodes nodes +stripEmptyNodes (node:nodes) = stripEmptyNodes nodes +stripEmptyNodes [] = [] + +collapseSpaces txt + | txt == "" = "" -- Avoids errors from head/tail tests. + | isSpace $ Txt.head txt = " " `Txt.append` collapseSpaces (Txt.stripStart txt) + | isSpace $ Txt.last txt = collapseSpaces (Txt.stripEnd txt) `Txt.append` " " + | otherwise = Txt.unwords $ Txt.words txt + +renderElLBS el = XML.renderLBS XML.def $ XML.Document { + XML.documentPrologue = XML.Prologue [] Nothing [], + XML.documentRoot = stripEmptyEls el, + XML.documentEpilogue = [] + } diff --git a/src/StyleTree.hs b/src/StyleTree.hs new file mode 100644 index 0000000..998916f --- /dev/null +++ b/src/StyleTree.hs @@ -0,0 +1,163 @@ +{-# LANGUAGE OverloadedStrings #-} +module StyleTree( + StyleTree(..), Pitch(..), Voice(..) + ) where + +import Data.CSS.Syntax.Tokens +import Data.Map +import qualified Data.HTML2CSS as H2C +import qualified Data.CSS.Style as Style +import Data.Text +import Data.Scientific (toRealFloat) + +data Pitch = Pitch Text | Absolute Text Float | Relative Text Float | Inherit + +parsePitch [Ident kw] | kw `elem` ["x-low", "low", "medium", "high", "x-high"] = Just $ Pitch kw +parsePitch [Ident "initial"] = Just $ Pitch "medium" +parsePitch [Dimension _ n unit, Ident "absolute"] + | unit `elem` ["hz", "khz"] = Just $ Absolute unit $ cssFloat n +parsePitch [Ident "absolute", Dimension _ n unit] + | unit `elem` ["hz", "khz"] = Just $ Absolute unit $ cssFloat n +parsePitch [Dimension _ n unit] + | unit `elem` ["hz", "khz"] = Just $ Relative unit $ cssFloat n +parsePitch _ = Nothing + +data Voice = Voice Text | VoicePattern (Maybe Integer) Text (Maybe Integer) + +genders = ["male", "female", "neutral"] +parseVoice (Ident "child":toks) = parseVoice (Number "6" (NVInteger 6):toks) +parseVoice (Ident "young":toks) = parseVoice (Number "24" (NVInteger 24):toks) +parseVoice (Ident "old":toks) = parseVoice (Number "75" (NVInteger 75):toks) +parseVoice [Ident kw, Number _ (NVInteger v)] + | v >= 1 && kw `elem` genders = Just $ VoicePattern Nothing kw $ Just v +parseVoice [Ident kw] | kw `elem` genders = Just $ VoicePattern Nothing kw Nothing +parseVoice [Number _ (NVInteger age), Ident kw, Number _ (NVInteger v)] + | age >= 0 && kw `elem` genders && v >= 1 = + Just $ VoicePattern (Just age) kw (Just v) +parseVoice [Number _ (NVInteger age), Ident kw] + | age >= 0 && kw `elem` genders = Just $ VoicePattern (Just age) kw Nothing + +data Pause = Pause { + strength :: Maybe Text, + time :: Maybe Text +} +parsePause [Ident kw] + | kw `elem` ["none", "x-weak", "weak", "medium", "strong", "x-strong"] = Just Pause { + strength = Just kw, time = Nothing + } +parsePause toks@[Dimension _ _ unit] | unit `elem` ["s", "ms"] = Just Pause { + strength = Nothing, time = Just $ serialize toks + } +parsePause _ = Nothing + +data Cue = Cue {src :: Text, cueVolume :: Float} | NoCue +parseCue [Url source] = Just $ Cue source 0 +parseCue [Url source, Dimension _ n "dB"] = Just $ Cue source $ cssFloat n +parseCue [Ident "none"] = Just NoCue +parseCue _ = Nothing + +data StyleTree = StyleTree { + voice :: Maybe Voice, + volume :: Maybe Text, + rate :: Maybe Text, + pitch :: Pitch, + range :: Pitch, + speak :: Bool, + speakAs :: Maybe Text, + punctuation :: Maybe Bool, + stress :: Maybe Text, + + pauseBefore :: Pause, + pauseAfter :: Pause, + restBefore :: Pause, + restAfter :: Pause, + + children :: [StyleTree], + content :: Text +} + +instance Style.PropertyParser StyleTree where + temp = StyleTree { + voice = Nothing, + volume = Nothing, + rate = Nothing, + pitch = Inherit, + range = Inherit, + speak = True, + speakAs = Nothing, + punctuation = Nothing, + stress = Nothing, + + pauseBefore = Pause Nothing Nothing, + pauseAfter = Pause Nothing Nothing, + restBefore = Pause Nothing Nothing, + restAfter = Pause Nothing Nothing, + + children = [], + content = "" + } + inherit _ = Style.temp + + shorthand _ "pause" [a, b] | Just _ <- parsePause [a], Just _ <- parsePause [b] = + [("pause-before", [a]), ("pause-after", [b])] + shorthand _ "pause" v | Just _ <- parsePause v = + [("pause-before", v), ("pause-after", v)] + + shorthand _ "rest" [a, b] | Just _ <- parsePause [a], Just _ <- parsePause [b] = + [("rest-before", [a]), ("rest-after", [b])] + shorthand _ "rest" v | Just _ <- parsePause v = + [("rest-before", v), ("rest-after", v)] + + --shorthand _ "cue" (a:b@(Dimension _ _ _):c) | Just _ <- parseCue [a, b], Just _ <- parseCue c = + -- [("cue-before", [a, b]), ("cue-after", c)] + --shorthand _ "cue" (a:b) | Just _ <- parseCue [a], Just _ <- parseCue b = + -- [("cue-before", [a]), ("cue-after", c)] + --shorthand _ "cue" v | Just _ <- parseCue v = + -- [("cue-before", v), ("cue-after", v)] + + shorthand self key value | Just _ <- Style.longhand self self key value = [(key, value)] + | otherwise = [] + + longhand _ self "voice-volume" [Ident kw] -- TODO handle offsets + | kw `elem` ["silent", "x-soft", "soft", "medium", "loud", "x-loud"] = + Just self {volume = Just kw} + longhand _ self "voice-volume" [Ident "initial"] = Just self {volume = Just "medium"} + + longhand _ self "voice-rate" [Ident kw] -- TODO handle offsets + | kw `elem` ["x-slow", "slow", "medium", "fast", "x-fast"] = Just self {rate = Just kw} + | kw `elem` ["initial", "normal"] = Just self {rate = Just "default"} + + longhand _ self "voice-pitch" toks = parsePitch toks >>= \val -> Just self {pitch = val} + longhand _ self "voice-range" toks = parsePitch toks >>= \val -> Just self {range = val} + + longhand _ self "speak" [Ident "never"] = Just self {speak = False} + longhand _ self "speak" [Ident kw] | kw `elem` ["always", "initial"] = Just self {speak = True} + + longhand _ self "speak-as" [Ident kw] | + kw `elem` ["normal", "initial"] = Just self {speakAs = Nothing} + longhand _ self "speak-as" [Ident "spell-out"] = Just self {speakAs = Just "characters"} + longhand _ self "speak-as" [Ident "digits"] = Just self {speakAs = Just "tts:digits"} + longhand _ self "speak-as" [Ident "literal-punctuation"] = Just self {punctuation = Just True} + longhand _ self "speak-as" [Ident "no-punctuation"] = Just self {punctuation = Just False} + + longhand _ self "voice-family" [Ident "preserve"] = Just self + longhand _ self "voice-family" toks = parseVoice toks >>= \val -> Just self {voice = Just val} + + longhand _ self "voice-stress" [Ident kw] + | kw `elem` ["strong", "moderate", "none", "reduced"] = Just self {stress = Just kw} + longhand _ self "voice-stress" [Ident "normal"] = Just self {stress = Just "moderate"} + + longhand _ self "pause-before" toks = parsePause toks >>= \val -> Just self {pauseBefore = val} + longhand _ self "pause-after" toks = parsePause toks >>= \val -> Just self {pauseAfter = val} + longhand _ self "rest-before" toks = parsePause toks >>= \val -> Just self {restBefore = val} + longhand _ self "rest-after" toks = parsePause toks >>= \val -> Just self {restAfter = val} + + longhand _ self _ [Ident "inherit"] = Just self + longhand _ _ _ _ = Nothing + +-------- +---- Helpers +-------- +cssFloat :: NumericValue -> Float +cssFloat (NVInteger i) = fromInteger i +cssFloat (NVNumber n) = toRealFloat n -- 2.30.2