From f5947f03d9cfb0a400227f4d501166db8c608bf6 Mon Sep 17 00:00:00 2001
From: pluja <codeberg@r3d.red>
Date: Fri, 28 Apr 2023 15:48:52 +0200
Subject: [PATCH] Use piper for realistic TTS, remove google tts

---
 .gitignore          |  9 ++++-
 Dockerfile          | 12 +++++--
 README.md           |  4 +--
 entrypoint.sh       |  7 ++++
 example.env         | 11 +++---
 main.py             | 81 +++++++++++++++++++++------------------------
 piper/init-piper.sh | 81 +++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt    |  3 +-
 utils.py            | 20 +++++++++++
 9 files changed, 174 insertions(+), 54 deletions(-)
 create mode 100644 entrypoint.sh
 create mode 100755 piper/init-piper.sh
 create mode 100644 utils.py

diff --git a/.gitignore b/.gitignore
index 0e7f7e4..822b06e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,11 @@
 venv/
 .env
 *.db
-__pycache__/
\ No newline at end of file
+__pycache__/
+db_data
+*.ogg
+piper/*.so*
+piper/piper
+piper/espeak*
+piper/voices
+MODEL_CARD
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index eb177e4..adc8505 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,16 @@
 FROM python:3.10-slim
 
-RUN apt update && apt install -y ffmpeg libespeak1
+# Set the voice language
+ARG VOICE_LANGUAGE=en
+
+RUN apt update && apt install -y ffmpeg wget libespeak1
 WORKDIR /app
 
+COPY ./entrypoint.sh /app
+RUN chmod +x /app/entrypoint.sh
+
+COPY ./piper /app/piper
+
 COPY ./main.py /app
 COPY ./database.py /app
 COPY ./requirements.txt /app
@@ -11,4 +19,4 @@ RUN mkdir db_data
 
 RUN pip install --upgrade pip
 RUN pip install -r requirements.txt
-CMD [ "python3", "/app/main.py" ]
\ No newline at end of file
+ENTRYPOINT [ "/app/entrypoint.sh" ]
\ No newline at end of file
diff --git a/README.md b/README.md
index 1705fd4..7091235 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ A telegram bot to interact with OpenAI API. You can:
 - Voice chat with ChatGPT:
    - Send voice message.
    - Receive voice messages.
-   - Use GoogleTTS or 100% local Espeak (more robotic).
+   - Use 100% local Text-To-Speech with Language Recognition to give ChatGPT a voice in many languages!
 
 Other features include:
 
@@ -53,7 +53,7 @@ Self hosting this chatbot is pretty easy. You just need to follow this steps:
       - Set your ALLOWED_USERS (comma separated user ids). Set it to `*` to allow all users.
       - Set the SYSTEM_PROMPT for ChatGPT. This is always instructed to ChatGPT as the system.
       - Optional: Edit the MAX_CONTEXT. This variable sets the number of messages that will be sent to ChatGPT API as context for the conversation.
-      - WHISPER_TO_CHAT allows you to choose wether Whisper transcripts should be instructed to ChatGPT or not.
+      - WHISPER_TO_GPT allows you to choose whether Whisper transcripts should be instructed to ChatGPT or not.
          - You can also configure this using `/settings` in chat.
       - ENABLE_GOOGLE_TTS the TTS service will be provided by GoogleTTS, producing more natural voices. If disabled, it fallsback to local voice generation using Espeak.
       - VOICE_LANGUAGE country code for the default voice accent.
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100644
index 0000000..c63d2db
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Container entrypoint: installs piper (if TTS is enabled), then starts the bot
+echo "Installing piper for text to voice conversion..."
+bash /app/piper/init-piper.sh
+echo "Bot starting..."
+# exec replaces this shell, so container stop signals reach the bot directly
+exec python3 -u /app/main.py
\ No newline at end of file
diff --git a/example.env b/example.env
index 8dd7f99..3e7e363 100644
--- a/example.env
+++ b/example.env
@@ -6,10 +6,13 @@ CHATGPT_MAX_USER_CONTEXT=5
 CHATGPT_TEMPERATURE=1.0
 
 # Use Whisper transcript from voice message with ChatGPT
-WHISPER_TO_CHAT=1
-# Use Google TTS for speech to text
-ENABLE_GOOGLE_TTS=0
-VOICE_LANGUAGE=en # en, es, fr, de, it, pt, ru, ja, ko
+WHISPER_TO_GPT=1
+
+# TTS Options
+ENABLE_TTS=1
+# If ENABLE_TTS=1, you can set the following options
+VOICE_LANGUAGE_LIST=en,es,fr,it,pt,ca
+DEFAULT_VOICE_LANGUAGE=en
 
 BOT_TOKEN=your-telegram-bot-token
 BOT_ALLOWED_USERS= XXXX,YYYY # Comma separated list of Telegram user IDs
diff --git a/main.py b/main.py
index b9c0282..7910a74 100644
--- a/main.py
+++ b/main.py
@@ -1,19 +1,19 @@
-import asyncio
 import logging
 import os
 import tempfile
 from functools import wraps
 from io import BytesIO
 
+import subprocess
+
 import openai
-import pyttsx3
 from aiogram import Bot, Dispatcher, types
 from aiogram.contrib.middlewares.logging import LoggingMiddleware
 from aiogram.types import InlineKeyboardButton, InlineKeyboardMarkup, ParseMode
 from aiogram.utils import executor
 from dotenv import load_dotenv
-from gtts import gTTS
 from pydub import AudioSegment
+from langdetect import detect
 
 import database
 
@@ -39,9 +39,13 @@ ALLOWED_USERS = os.environ.get("BOT_ALLOWED_USERS").split(",")
 SYSTEM_PROMPT = os.environ.get("CHATGPT_SYSTEM_PROMPT")
 TEMPERATURE = os.environ.get("CHATGPT_TEMPERATURE")
 MODEL = os.environ.get("OPENAI_MODEL")
-WHISPER_TO_CHAT = bool(int(os.environ.get("WHISPER_TO_CHAT")))
-ENABLE_GOOGLE_TTS = bool(int(os.environ.get("ENABLE_GOOGLE_TTS")))
-VOICE_LANGUAGE = os.environ.get("VOICE_LANGUAGE")
+WHISPER_TO_GPT = bool(int(os.environ.get("WHISPER_TO_GPT")))
+
+# TTS Settings
+ENABLE_TTS = bool(int(os.environ.get("ENABLE_TTS")))
+DEFAULT_VOICE_LANGUAGE = os.environ.get("DEFAULT_VOICE_LANGUAGE")
+VOICE_LANGUAGE_LIST = os.environ.get("VOICE_LANGUAGE_LIST")
+
 MAX_USER_CONTEXT = int(os.environ.get("CHATGPT_MAX_USER_CONTEXT"))
 openai.api_key = os.environ.get("OPENAI_API_KEY")
 
@@ -52,7 +56,7 @@ async def getUserData(chat_id):
             "context": [],
             "usage": {"chatgpt": 0, "whisper": 0, "dalle": 0},
             "options": {
-                "whisper_to_chat": WHISPER_TO_CHAT,
+                "whisper_to_chat": WHISPER_TO_GPT,
                 "assistant_voice_chat": False,
                 "temperature": float(TEMPERATURE),
                 "max-context": MAX_USER_CONTEXT
@@ -83,43 +87,34 @@ def generate_settings_markup(chat_id: str) -> InlineKeyboardMarkup:
     ]
     return InlineKeyboardMarkup(inline_keyboard=keyboard)
 
-def change_voice(engine, gender='male'):
-    for voice in engine.getProperty('voices'):
-        if VOICE_LANGUAGE in voice.languages[0].decode('utf-8') and gender == voice.gender:
-            engine.setProperty('voice', voice.id)
-            return True
-
-async def text_to_voice(text: str) -> BytesIO:
-    with tempfile.NamedTemporaryFile(mode='wb', suffix='.ogg', delete=False) as ogg_file:
-        temp_filename = ogg_file.name
-        voice_done = False
+async def text_to_voice(text: str, language: str = None) -> BytesIO:
+    """Synthesize *text* with the piper binary next to this file; return the audio.
+
+    *language* selects piper/voices/<language>.onnx; None detects it from *text*.
         
-        # If Google TTS is enabled, try to use it first
-        if ENABLE_GOOGLE_TTS:
-            try:
-                tts = gTTS(text, lang=VOICE_LANGUAGE)
-                tts.save(temp_filename)
-                voice_done = True
-            except Exception as e:
-                print("Google TTS failed, falling back to pyttsx3: --> ", e)
-        
-        # If Google TTS is disabled or failed, use pyttsx3
-        if not voice_done:
-            engine = pyttsx3.init()
-            change_voice(engine)
-            engine.setProperty('rate', 160)
-            engine.save_to_file(text, temp_filename)
-            engine.runAndWait()
-            engine.stop()
-            # Add a small delay before reading the file
-            await asyncio.sleep(1)
+    Falls back to DEFAULT_VOICE_LANGUAGE when detection fails or the voice is missing.
+    Raises subprocess.CalledProcessError if piper exits with a non-zero status."""
+    piper_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "piper")
+    if language is None:
+        try:
+            language = detect(text[0:80])
+        except Exception:  # best effort: langdetect raises when it finds no features
+            language = DEFAULT_VOICE_LANGUAGE
+    model_path = os.path.join(piper_dir, "voices", f"{language}.onnx")
+    if not os.path.exists(model_path):  # no voice was downloaded for this language
+        model_path = os.path.join(piper_dir, "voices", f"{DEFAULT_VOICE_LANGUAGE}.onnx")
+    with tempfile.NamedTemporaryFile(suffix=".ogg", delete=False) as tmp:
+        tmp_filename = tmp.name
+    cmd = [os.path.join(piper_dir, "piper"), "--model", model_path, "--output_file", tmp_filename]
+    try:  # no shell: the text goes to stdin, so quotes in it cannot inject commands
+        payload = text.replace('"', "").replace("\n", " ") + "\n"
+        subprocess.run(cmd, input=payload.encode("utf-8"), check=True)
 
-    with open(temp_filename, "rb") as audio_file:
-        voice_data = BytesIO(audio_file.read())
-
-    os.remove(temp_filename)
-    voice_data.seek(0)
-    return voice_data
+        # Read the audio into memory so the temporary file can always be removed
+        with open(tmp_filename, 'rb') as file:
+            return BytesIO(file.read())
+    finally:
+        os.remove(tmp_filename)
     
 
 def restricted(func):
@@ -364,7 +359,7 @@ if __name__ == '__main__':
         
     print(f"Allowed users: {ALLOWED_USERS}")
     print(f"System prompt: {SYSTEM_PROMPT}")
-    print(f"Google TTS: {ENABLE_GOOGLE_TTS}")
+    print(f"TTS: {ENABLE_TTS}")
     
     # Register message handler and callback query handler for settings
     dp.register_message_handler(settings, commands=['settings'])
diff --git a/piper/init-piper.sh b/piper/init-piper.sh
new file mode 100755
index 0000000..0914e80
--- /dev/null
+++ b/piper/init-piper.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+#
+# Install the piper text-to-speech binary and its voices under ./piper/.
+#
+# Run it from the project root. ENABLE_TTS and VOICE_LANGUAGE_LIST are read
+# from .env when that file exists, otherwise from the environment.
+# The script can be run repeatedly: files already installed are kept.
+
+[ -f .env ] && source .env
+
+RELEASE_URL="https://github.com/rhasspy/piper/releases/download/v0.0.2"
+
+# download_voice LANG NAME VOICE
+#
+# Fetch voice-VOICE.tar.gz from the piper release and install its model as
+# piper/voices/LANG.onnx and piper/voices/LANG.onnx.json.
+# NAME is only used in log messages. Returns 1 when the download fails.
+download_voice() {
+    local lang="$1" name="$2" voice="$3"
+
+    if [ -f "piper/voices/$lang.onnx" ]; then
+        echo "The $name voice is already installed"
+        return 0
+    fi
+
+    echo "Downloading $name voice..."
+    if ! wget -q "$RELEASE_URL/voice-$voice.tar.gz"; then
+        # Keep going with the other voices instead of moving missing files
+        echo "Could not download the $name voice" >&2
+        return 1
+    fi
+    tar -xf "voice-$voice.tar.gz"
+    mv "$voice.onnx" "piper/voices/$lang.onnx"
+    mv "$voice.onnx.json" "piper/voices/$lang.onnx.json"
+    rm -rf "voice-$voice.tar.gz"
+    echo "Done"
+}
+
+if [ "$ENABLE_TTS" = 1 ]; then
+    echo "Installing piper for text to voice conversion..."
+
+    # Download the binary only when a previous run has not installed it yet
+    if [ ! -x piper/piper ]; then
+        echo "Downloading piper v0.0.2.."
+        if ! wget -q "$RELEASE_URL/piper_amd64.tar.gz"; then
+            echo "Could not download piper" >&2
+            exit 1
+        fi
+        echo "Extracting piper"
+        tar -xf piper_amd64.tar.gz
+
+        echo "Installing piper"
+        rm -rf piper_amd64
+        rm piper_amd64.tar.gz
+        chmod -R 777 ./piper/
+    fi
+
+    # -p: the directory already exists when the script runs a second time
+    mkdir -p piper/voices
+
+    echo "Downloading tts voices from VOICE_LANGUAGE_LIST..."
+    echo "This can take a while..."
+    # VOICE_LANGUAGE_LIST is comma separated; install every language we know
+    for lang in $(echo "$VOICE_LANGUAGE_LIST" | tr "," " "); do
+        case "$lang" in
+            en) download_voice en english en-us-ryan-high ;;
+            es) download_voice es spanish es-mls_10246-low ;;
+            fr) download_voice fr french fr-siwis-medium ;;
+            it) download_voice it italian it-riccardo_fasol-x-low ;;
+            pt) download_voice pt portuguese pt-br-edresson-low ;;
+            ca) download_voice ca catalan ca-upc_ona-x-low ;;
+            *)
+                # Languages without a voice used to be skipped silently
+                echo "No piper voice available for language: $lang" >&2
+                ;;
+        esac
+    done
+    echo "Done. Piper installed!"
+else
+    echo "TTS Disabled. No work to do..."
+fi
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index e49680d..0068c9b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,5 @@
 aiogram==2.25.1
-gTTS==2.3.1
+langdetect==1.0.9
 openai==0.27.2
 pydub==0.25.1
 python-dotenv==1.0.0
-pyttsx3==2.90
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..a32be5e
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,20 @@
+import subprocess
+import tempfile
+import os
+
+def text_to_speech(text: str) -> str:
+    """Synthesize *text* with piper; return the path of the generated audio file.
+
+    The caller owns the returned temporary file and must delete it.
+    Raises subprocess.CalledProcessError if piper exits with a non-zero status.
+    """
+    binary_path = "./piper"
+    model_path = "blizzard_lessac-medium.onnx"
+    # Reserve a unique output path; delete=False keeps it after the handle closes
+    with tempfile.NamedTemporaryFile(delete=False) as tmp:
+        tmp_filename = tmp.name
+    # No shell: the text goes to piper's stdin, so quotes in it cannot break the
+    # command line or inject commands (echo also appended the trailing newline)
+    cmd = [binary_path, "--model", model_path, "--output_file", tmp_filename]
+    subprocess.run(cmd, input=(text + "\n").encode("utf-8"), check=True)
+    return tmp_filename