~alcinnz/rhapsode

eddbb26ad34055ed8547b39bab45d19d8de173c1 — Adrian Cochrane 5 years ago df29030
Output SSML that TTS software like eSpeak can read.
2 files changed, 250 insertions(+), 63 deletions(-)

M src/Main.hs
A src/StyleTree.hs
M src/Main.hs => src/Main.hs +87 -63
@@ 2,6 2,7 @@
module Main where

import System.Environment
import Data.Char (isSpace)

import qualified Network.HTTP.Client as HTTP
import           Network.HTTP.Client.Internal


@@ 11,7 12,8 @@ import qualified Data.ByteString.Lazy.Char8 as C8

import qualified Text.HTML.DOM as HTML
import qualified Text.XML as XML
import           Data.Text as Txt (pack, unpack, Text(..), append)
import           Data.Text as Txt (pack, unpack, Text(..), append,
        words, unwords, head, last, stripStart, stripEnd)

import qualified Data.CSS.Syntax.StyleSheet as CSS
import qualified Data.CSS.Style as Style


@@ 23,6 25,7 @@ import qualified Data.Map as M
import Data.Scientific (toRealFloat)

import DefaultCSS
import StyleTree

main :: IO ()
main = do


@@ 33,19 36,14 @@ main = do
    response <- HTTP.httpLbs request manager
    let html = XML.documentRoot $ HTML.parseLBS $ HTTP.responseBody response
    style <- retreiveStyles html manager request
    C8.putStrLn $ renderElLBS $ stylizeEl style html

renderElLBS el = XML.renderLBS XML.def $ XML.Document {
        XML.documentPrologue = XML.Prologue [] Nothing [],
        XML.documentRoot = el,
        XML.documentEpilogue = []
    }
    let transcript = stylize style html
    C8.putStrLn $ renderElLBS $ styleToSSML transcript

retreiveStyles html manager base = do
    style <- H2C.externalStyles authorStyle testMedia html loadURL
    return style
  where
    emptyStyle :: Style.QueryableStyleSheet MapPropertyParser
    emptyStyle :: Style.QueryableStyleSheet StyleTree
    emptyStyle = Style.queryableStyleSheet
    agentStyle = H2C.cssPriorityAgent emptyStyle `CSS.parse` Txt.pack userAgentCSS
    authorStyle = H2C.internalStyles testMedia agentStyle html


@@ 59,58 57,84 @@ testMedia attrs = media == Nothing || media == Just "speech"
    where media = "media" `M.lookup` attrs


stylizeEl styles html
        | XML.NodeElement el <- H2C.traverseStyles buildEl buildText styles html = el
        |otherwise = XML.Element name M.empty []
stylize styles html = H2C.traverseStyles buildNode buildText styles html
    where
        buildEl (MPP attrs) children = XML.NodeElement $ XML.Element name attrs children
        buildText _ txt = XML.NodeContent txt
        name = XML.Name "prosody" Nothing Nothing

data MapPropertyParser = MPP (M.Map XML.Name Text)
addAttr self name value = Just $ MPP $ M.insert (XML.Name name Nothing Nothing) value self
lowerHz n "khz" absolute = lowerHz (n*1000) "hz" absolute
-- Relative offsets are communicated in SSML by the presence of a sign character.
lowerHz n "hz" True | n < 0 = "0hz"
    | otherwise = Txt.pack (show n ++ "hz")
lowerHz n "hz" False | n < 0 = Txt.pack (show n ++ "hz")
    | otherwise = Txt.pack ('+' : show n ++ "hz")

cssFloat :: CSSTok.NumericValue -> Float
cssFloat (CSSTok.NVInteger i) = fromInteger i
cssFloat (CSSTok.NVNumber n) = toRealFloat n

instance Style.PropertyParser MapPropertyParser where
    temp = MPP M.empty
    inherit _ = Style.temp

    longhand _ (MPP self) "voice-volume" [CSSTok.Ident kw] -- TODO handle offsets
        | kw `elem` ["silent", "x-soft", "soft", "medium", "loud", "x-loud"] = addAttr self "volume" kw
    longhand _ (MPP self) "voice-volume" [CSSTok.Ident "initial"] = addAttr self "volume" "medium"

    longhand _ (MPP self) "voice-rate" [CSSTok.Ident kw] -- TODO handle percentages
        | kw `elem` ["x-slow", "slow", "medium", "fast", "x-fast"] = addAttr self "rate" kw
        | kw `elem` ["initial", "normal"] = addAttr self "rate" "default"

    longhand _ (MPP self) "voice-pitch" [CSSTok.Ident kw] -- TODO handle offsets
        | kw `elem` ["x-low", "low", "medium", "high", "x-high"] = addAttr self "pitch" kw
    longhand _ (MPP self) "voice-pitch" [CSSTok.Ident "initial"] = addAttr self "pitch" "medium"
    longhand _ (MPP self) "voice-pitch" [CSSTok.Dimension _ n unit, CSSTok.Ident "absolute"] =
        addAttr self "pitch" $ lowerHz (cssFloat n) unit True
    longhand _ (MPP self) "voice-pitch" [CSSTok.Ident "absolute", CSSTok.Dimension _ n unit] =
        addAttr self "pitch" $ lowerHz (cssFloat n) unit True
    longhand _ (MPP self) "voice-pitch" [CSSTok.Dimension _ n unit] =
        addAttr self "pitch" $ lowerHz (cssFloat n) unit False

    longhand _ (MPP self) "voice-range" [CSSTok.Ident kw] -- TODO handle offsets
        | kw `elem` ["x-low", "low", "medium", "high", "x-high"] = addAttr self "range" kw
    longhand _ (MPP self) "voice-range" [CSSTok.Ident "initial"] = addAttr self "range" "medium"
    longhand _ (MPP self) "voice-range" [CSSTok.Dimension _ n unit, CSSTok.Ident "absolute"] =
        addAttr self "range" $ lowerHz (cssFloat n) unit True
    longhand _ (MPP self) "voice-range" [CSSTok.Ident "absolute", CSSTok.Dimension _ n unit] =
        addAttr self "range" $ lowerHz (cssFloat n) unit True
    longhand _ (MPP self) "voice-range" [CSSTok.Dimension _ n unit] =
        addAttr self "range" $ lowerHz (cssFloat n) unit False

    longhand _ self _ [CSSTok.Ident "inherit"] = Just self -- Imply the inheritance
    longhand _ _ _ _ = Nothing
        buildNode self children = self {children = children}
        buildText _ txt = Style.temp {content = txt}

styleToSSML StyleTree {speak = False} =
    XML.Element (XML.Name "blank" Nothing Nothing) M.empty []
styleToSSML self =  buildEl0 "prosody" [
            ("volume", volume self),
            ("rate", rate self),
            ("pitch", serializePitch $ pitch self),
            ("range", serializePitch $ range self)
        ] $ buildEl0 "emphasis" [("level", stress self)] $
        buildEl0 "say-as" [("interpret-as", speakAs self)] $
        buildEl0 "tts:style" [
            ("field", maybeBool (punctuation self) "punctuation" "punctuation"),
            ("mode", maybeBool (punctuation self) "all" "none")
        ] $ buildVoices (voice self) (
            XML.NodeContent (content self) :
            map (\node -> XML.NodeElement $ styleToSSML node) (children self)
        )

serializePitch Inherit = Nothing
serializePitch (Pitch kw) = Just kw
serializePitch (Absolute "khz" n) = serializePitch $ Absolute "hz" (1000 * n)
serializePitch (Absolute _ n) | n < 0 = Nothing
    | otherwise = Just $ Txt.pack (show n ++ "hz")
serializePitch (Relative "khz" n) = serializePitch $ Relative "hz" (1000 * n)
serializePitch (Relative _ n) | n < 0 = Just $ Txt.pack (show n ++ "hz")
    | otherwise = Just $ Txt.pack ("+" ++ show n ++ "hz")

maybeBool (Just True) truthy _ = Just truthy
maybeBool (Just False) _ falsy = Just falsy
maybeBool Nothing _ _ = Nothing

maybeCast (Just n) = Just $ Txt.pack $ show n
maybeCast Nothing = Nothing

buildVoice (Voice name) children = buildEl "voice" [("name", Just name)] children
buildVoice (VoicePattern age gender variant) children = buildEl "voice" [
        ("age", maybeCast age),
        ("gender", Just gender),
        ("variant", maybeCast variant)
    ] children

buildVoices Nothing = buildEl "blank" []
buildVoices (Just voice) = buildVoice voice

--------
---- XML Output Utils
--------
buildEl name attrs elChildren =
    XML.Element (XML.Name name Nothing Nothing) (attrsFromList attrs) elChildren
buildEl0 name attrs child = buildEl name attrs [XML.NodeElement child]

attrsFromList ((name, Just value):attrs) =
    M.insert (XML.Name name Nothing Nothing) value $ attrsFromList attrs
attrsFromList ((_, Nothing):attrs) = attrsFromList attrs
attrsFromList [] = M.empty

stripEmptyEls (XML.Element name attrs elChildren) =
    XML.Element name attrs $ stripEmptyNodes elChildren
stripEmptyNodes (XML.NodeElement el@(XML.Element _ attrs elChildren ):nodes)
    | M.null attrs = stripEmptyNodes (elChildren ++ nodes)
    | otherwise = XML.NodeElement (stripEmptyEls el) : stripEmptyNodes nodes
stripEmptyNodes (XML.NodeContent txt : nodes) = -- strip whitespace
    XML.NodeContent (collapseSpaces txt) : stripEmptyNodes nodes
stripEmptyNodes (node:nodes) = stripEmptyNodes nodes
stripEmptyNodes [] = []

collapseSpaces txt
    | txt == "" = "" -- Avoids errors from head/tail tests.
    | isSpace $ Txt.head txt = " " `Txt.append` collapseSpaces (Txt.stripStart txt)
    | isSpace $ Txt.last txt = collapseSpaces (Txt.stripEnd txt) `Txt.append` " "
    | otherwise = Txt.unwords $ Txt.words txt

renderElLBS el = XML.renderLBS XML.def $ XML.Document {
        XML.documentPrologue = XML.Prologue [] Nothing [],
        XML.documentRoot = stripEmptyEls el,
        XML.documentEpilogue = []
    }

A src/StyleTree.hs => src/StyleTree.hs +163 -0
@@ 0,0 1,163 @@
{-# LANGUAGE OverloadedStrings #-}
module StyleTree(
        StyleTree(..), Pitch(..), Voice(..)
    ) where

import Data.CSS.Syntax.Tokens
import Data.Map
import qualified Data.HTML2CSS as H2C
import qualified Data.CSS.Style as Style
import Data.Text
import Data.Scientific (toRealFloat)

data Pitch = Pitch Text | Absolute Text Float | Relative Text Float | Inherit

parsePitch [Ident kw] | kw `elem` ["x-low", "low", "medium", "high", "x-high"] = Just $ Pitch kw
parsePitch [Ident "initial"] = Just $ Pitch "medium"
parsePitch [Dimension _ n unit, Ident "absolute"]
    | unit `elem` ["hz", "khz"] = Just $ Absolute unit $ cssFloat n
parsePitch [Ident "absolute", Dimension _ n unit]
    | unit `elem` ["hz", "khz"] = Just $ Absolute unit $ cssFloat n
parsePitch [Dimension _ n unit]
    | unit `elem` ["hz", "khz"] = Just $ Relative unit $ cssFloat n
parsePitch _ = Nothing

data Voice = Voice Text | VoicePattern (Maybe Integer) Text (Maybe Integer)

genders = ["male", "female", "neutral"]
parseVoice (Ident "child":toks) = parseVoice (Number "6" (NVInteger 6):toks)
parseVoice (Ident "young":toks) = parseVoice (Number "24" (NVInteger 24):toks)
parseVoice (Ident "old":toks) = parseVoice (Number "75" (NVInteger 75):toks)
parseVoice [Ident kw, Number _ (NVInteger v)]
    | v >= 1 && kw `elem` genders = Just $ VoicePattern Nothing kw $ Just v
parseVoice [Ident kw] | kw `elem` genders = Just $ VoicePattern Nothing kw Nothing
parseVoice [Number _ (NVInteger age), Ident kw, Number _ (NVInteger v)]
    | age >= 0 && kw `elem` genders && v >= 1 =
        Just $ VoicePattern (Just age) kw (Just v)
parseVoice [Number _ (NVInteger age), Ident kw]
    | age >= 0 && kw `elem` genders = Just $ VoicePattern (Just age) kw Nothing

data Pause = Pause {
    strength :: Maybe Text,
    time :: Maybe Text
}
parsePause [Ident kw]
    | kw `elem` ["none", "x-weak", "weak", "medium", "strong", "x-strong"] = Just Pause {
        strength = Just kw, time = Nothing
    }
parsePause toks@[Dimension _ _ unit] | unit `elem` ["s", "ms"] = Just Pause {
        strength = Nothing, time = Just $ serialize toks
    }
parsePause _ = Nothing

data Cue = Cue {src :: Text, cueVolume :: Float} | NoCue
parseCue [Url source] = Just $ Cue source 0
parseCue [Url source, Dimension _ n "dB"] = Just $ Cue source $ cssFloat n
parseCue [Ident "none"] = Just NoCue
parseCue _ = Nothing

data StyleTree = StyleTree {
    voice :: Maybe Voice,
    volume :: Maybe Text,
    rate :: Maybe Text,
    pitch :: Pitch,
    range :: Pitch,
    speak :: Bool,
    speakAs :: Maybe Text,
    punctuation :: Maybe Bool,
    stress :: Maybe Text,

    pauseBefore :: Pause,
    pauseAfter :: Pause,
    restBefore :: Pause,
    restAfter :: Pause,

    children :: [StyleTree],
    content :: Text
}

instance Style.PropertyParser StyleTree where
    temp = StyleTree {
        voice = Nothing,
        volume = Nothing,
        rate = Nothing,
        pitch = Inherit,
        range = Inherit,
        speak = True,
        speakAs = Nothing,
        punctuation = Nothing,
        stress = Nothing,

        pauseBefore = Pause Nothing Nothing,
        pauseAfter = Pause Nothing Nothing,
        restBefore = Pause Nothing Nothing,
        restAfter = Pause Nothing Nothing,

        children = [],
        content = ""
    }
    inherit _ = Style.temp

    shorthand _ "pause" [a, b] | Just _ <- parsePause [a], Just _ <- parsePause [b] =
        [("pause-before", [a]), ("pause-after", [b])]
    shorthand _ "pause" v | Just _ <- parsePause v =
        [("pause-before", v), ("pause-after", v)]

    shorthand _ "rest" [a, b] | Just _ <- parsePause [a], Just _ <- parsePause [b] =
        [("rest-before", [a]), ("rest-after", [b])]
    shorthand _ "rest" v | Just _ <- parsePause v =
        [("rest-before", v), ("rest-after", v)]

    --shorthand _ "cue" (a:b@(Dimension _ _ _):c) | Just _ <- parseCue [a, b], Just _ <- parseCue c =
    --    [("cue-before", [a, b]), ("cue-after", c)]
    --shorthand _ "cue" (a:b) | Just _ <- parseCue [a], Just _ <- parseCue b =
    --    [("cue-before", [a]), ("cue-after", c)]
    --shorthand _ "cue" v | Just _ <- parseCue v =
    --    [("cue-before", v), ("cue-after", v)]

    shorthand self key value | Just _ <- Style.longhand self self key value = [(key, value)]
        | otherwise = []

    longhand _ self "voice-volume" [Ident kw] -- TODO handle offsets
        | kw `elem` ["silent", "x-soft", "soft", "medium", "loud", "x-loud"] =
            Just self {volume = Just kw}
    longhand _ self "voice-volume" [Ident "initial"] = Just self {volume = Just "medium"}

    longhand _ self "voice-rate" [Ident kw] -- TODO handle offsets
        | kw `elem` ["x-slow", "slow", "medium", "fast", "x-fast"] = Just self {rate = Just kw}
        | kw `elem` ["initial", "normal"] = Just self {rate = Just "default"}

    longhand _ self "voice-pitch" toks = parsePitch toks >>= \val -> Just self {pitch = val}
    longhand _ self "voice-range" toks = parsePitch toks >>= \val -> Just self {range = val}

    longhand _ self "speak" [Ident "never"] = Just self {speak = False}
    longhand _ self "speak" [Ident kw] | kw `elem` ["always", "initial"] = Just self {speak = True}

    longhand _ self "speak-as" [Ident kw] |
        kw `elem` ["normal", "initial"] = Just self {speakAs = Nothing}
    longhand _ self "speak-as" [Ident "spell-out"] = Just self {speakAs = Just "characters"}
    longhand _ self "speak-as" [Ident "digits"] = Just self {speakAs = Just "tts:digits"}
    longhand _ self "speak-as" [Ident "literal-punctuation"] = Just self {punctuation = Just True}
    longhand _ self "speak-as" [Ident "no-punctuation"] = Just self {punctuation = Just False}

    longhand _ self "voice-family" [Ident "preserve"] = Just self
    longhand _ self "voice-family" toks = parseVoice toks >>= \val -> Just self {voice = Just val}

    longhand _ self "voice-stress" [Ident kw]
        | kw `elem` ["strong", "moderate", "none", "reduced"] = Just self {stress = Just kw}
    longhand _ self "voice-stress" [Ident "normal"] = Just self {stress = Just "moderate"}

    longhand _ self "pause-before" toks = parsePause toks >>= \val -> Just self {pauseBefore = val}
    longhand _ self "pause-after" toks = parsePause toks >>= \val -> Just self {pauseAfter = val}
    longhand _ self "rest-before" toks = parsePause toks >>= \val -> Just self {restBefore = val}
    longhand _ self "rest-after" toks = parsePause toks >>= \val -> Just self {restAfter = val}

    longhand _ self _ [Ident "inherit"] = Just self
    longhand _ _ _ _ = Nothing

--------
---- Helpers
--------
cssFloat :: NumericValue -> Float
cssFloat (NVInteger i) = fromInteger i
cssFloat (NVNumber n) = toRealFloat n