Bug fixes and optimizations

2023-04-29 18:34:20 +02:00 · 2023-04-29 18:34:20 +02:00 · edf2131a2e
commit edf2131a2e
--- a/example.env
+++ b/example.env
@ -1,3 +1,6 @@
+# # # # # # # # # #
+# OpenAI / ChatGPT #
+# # # # # # # # # #
 OPENAI_API_KEY=your-openai-api-key
 OPENAI_MODEL=gpt-3.5-turbo

@ -8,11 +11,21 @@ CHATGPT_TEMPERATURE=1.0
 # Use Whisper transcript from voice message with ChatGPT
 WHISPER_TO_GPT=1

-# TTS Options
+# # # # # # # #
+# TTS Options #
+# # # # # # # #
 ENABLE_TTS=1
-# If USE_TTS=1, you can set the following options
-VOICE_LANGUAGE_LIST=en,es,fr,it,pt,ca
-DEFAULT_VOICE_LANGUAGE=en

+# If ENABLE_TTS=1, you can set the following options
+# Remove any language whose voice you don't want to download
+VOICE_LANGUAGE_LIST=en,es,fr,it,pt,ca,no,nl,de
+DEFAULT_VOICE_LANGUAGE=en 
+
+# Do not change this line: languages whose voices use a 22050 Hz sample rate (all others use 16000 Hz)
+LANGUAGES_H_SR=en,fr,nl,no
+
+# # # # # # # # # #
+# Telegram Options #
+# # # # # # # # # #
 BOT_TOKEN=your-telegram-bot-token
 BOT_ALLOWED_USERS= XXXX,YYYY # Comma separated list of Telegram user IDs
--- a/main.py
+++ b/main.py
@ -1,10 +1,10 @@
+import asyncio
 import logging
 import os
-import tempfile
+import wave
 from functools import wraps
 from io import BytesIO
-
-import subprocess
+from typing import Tuple

 import openai
 from aiogram import Bot, Dispatcher, types
@ -12,8 +12,8 @@ from aiogram.contrib.middlewares.logging import LoggingMiddleware
 from aiogram.types import InlineKeyboardButton, InlineKeyboardMarkup, ParseMode
 from aiogram.utils import executor
 from dotenv import load_dotenv
-from pydub import AudioSegment
 from langdetect import detect
+from pydub import AudioSegment

 import database

@ -46,6 +46,7 @@ WHISPER_TO_GPT = bool(int(os.environ.get("WHISPER_TO_GPT")))
 ENABLE_TTS = bool(int(os.environ.get("ENABLE_TTS")))
 DEFAULT_VOICE_LANGUAGE = os.environ.get("DEFAULT_VOICE_LANGUAGE")
 VOICE_LANGUAGE_LIST = os.environ.get("VOICE_LANGUAGE_LIST")
+LANGUAGES_22050 = os.environ.get("LANGUAGES_H_SR").split(",")

 MAX_USER_CONTEXT = int(os.environ.get("CHATGPT_MAX_USER_CONTEXT"))
 openai.api_key = os.environ.get("OPENAI_API_KEY")
@ -107,7 +108,15 @@ def generate_settings_markup(chat_id: str) -> InlineKeyboardMarkup:
    return InlineKeyboardMarkup(inline_keyboard=keyboard)


-async def text_to_voice(text: str, language: str = None) -> BytesIO:
+async def send_voice_message(chat_id, assistant_message):
+    await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING)
+    audio_data, _ = await text_to_voice(assistant_message)
+    audio_data.seek(0)  # Reset the buffer's position to the beginning
+
+    await bot.send_voice(chat_id, audio_data)
+
+
+async def text_to_voice(text: str, language: str = None) -> Tuple[BytesIO, str]:
    binary_path = "./piper/piper"

    if language is None:
@ -115,34 +124,30 @@ async def text_to_voice(text: str, language: str = None) -> BytesIO:

    model_path = f"./piper/voices/{language}.onnx"

-    # Generate a unique temporary filename with '.ogg' extension
-    with tempfile.NamedTemporaryFile(suffix=".ogg", delete=False) as tmp:
-        tmp_filename = tmp.name
+    text = text.replace("\n", ". ")

-    # Run the binary with the escaped text as input and the temp file as output
-    with open(tmp_filename, "wb") as tmp_file:
-        process = subprocess.Popen(
-            [binary_path, "--model", model_path, "--output_file", "-"],
-            stdin=subprocess.PIPE,
-            stdout=tmp_file,
-            stderr=subprocess.PIPE,
-            text=True,
-            encoding="utf8",
-        )
-        
-        # Remove all newlines from the text so that the text is read as a single sentence
-        text = text.replace("\n", ". ")
-        process.communicate(input=text)
+    cmd = [binary_path, "--model", model_path, "--output_raw"]
+    proc = await asyncio.create_subprocess_exec(
+        *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE
+    )

-    # Open the file in binary mode and read its content into BytesIO object
-    with open(tmp_filename, "rb") as file:
-        bytes_io = BytesIO(file.read())
+    stdout, _ = await proc.communicate(input=text.encode("utf-8"))

-    # Delete the temporary file
-    os.remove(tmp_filename)
+    # Create a new BytesIO object to store the WAV file.
+    wav_file = BytesIO()
+    with wave.open(wav_file, "wb") as wf:
+        # Assuming 1 channel, 16 bits per sample, and 22050 or 16000 samples per second depending on the language.
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(22050 if language in LANGUAGES_22050 else 16000)
+        wf.writeframes(stdout)

-    # Return the BytesIO object
-    return bytes_io
+    wav_file.seek(0)
+
+    audio = AudioSegment.from_file(wav_file, format="wav")
+    audio = audio.export(format="ogg", codec="libopus", parameters=["-vbr", "on"])
+
+    return BytesIO(audio.read()), "ogg"


 def restricted(func):
@ -254,7 +259,7 @@ async def usage(message: types.Message) -> None:
 - Generated {user_usage["dalle"]} images with DALL-E.
 - Transcribed {round(float(user_usage["whisper"]) / 60.0, 2)}min with Whisper.

-Total spent: ${user_spent} ({user_percentage:.2f}% of total)
+User total: ${user_spent} ({user_percentage:.2f}% of total)

 Total usage:
 - ChatGPT tokens: {total_usage["chatgpt"]}
@ -297,9 +302,9 @@ async def attachment(message: types.Message):
    await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING)

    transcript = {"text": ""}
-
    audioMessage = False

+    # Handle media types
    if message.voice:
        user_data["usage"]["whisper"] += message.voice.duration
        file_id = message.voice.file_id
@ -317,16 +322,19 @@ async def attachment(message: types.Message):
        await message.reply("Can't handle such file. Reason: unknown.")
        return

+    # Download file from Telegram
    file = await bot.get_file(file_id)
    user_id = message.chat.id
    await file.download(f"{user_id}.{file_format}")

+    # Convert audio to mp3 if needed, because OpenAI doesn't support ogg
    if file_format == "ogg":
        ogg_audio = AudioSegment.from_file(f"{user_id}.ogg", format="ogg")
        ogg_audio.export(f"{user_id}.mp3", format="mp3")
        os.remove(f"{user_id}.ogg")
        file_format = "mp3"

+    # Transcribe audio with OpenAI API
    with open(f"{user_id}.{file_format}", "rb") as audio_file:
        try:
            await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING)
@ -336,25 +344,28 @@ async def attachment(message: types.Message):
            await message.reply("Transcript failed.")
            os.remove(f"{user_id}.{file_format}")
            return
+        os.remove(f"{user_id}.{file_format}")

-    os.remove(f"{user_id}.{file_format}")
-
+    # Handle empty transcript
    if transcript["text"] == "":
        transcript["text"] = "[Silence]"

+    # If whisper_to_chat is enabled, send transcript to ChatGPT and send its response along with the transcript
    chatGPT_response = False
    if audioMessage and user_data["options"]["whisper_to_chat"]:
        chatGPT_response, user_data = await messageGPT(
            transcript["text"], str(chat_id), message.from_user.full_name, user_data
        )
-        transcript["text"] = "> " + transcript["text"] + "\n\n" + chatGPT_response
+        transcript["text"] = ''.join(["> ", transcript["text"], "\n\n", chatGPT_response])

+    # Send transcript (and ChatGPT response if enabled)
    await message.reply(transcript["text"])
+    
+    # Send ChatGPT response as voice message with piper TTS if enabled
    if user_data["options"]["assistant_voice_chat"] and chatGPT_response:
-        await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING)
-        voice_data = await text_to_voice(chatGPT_response)
-        await message.reply_voice(voice_data)
-
+        asyncio.create_task(send_voice_message(chat_id, chatGPT_response))
+        
+    # Update user data
    database.update_user(str(chat_id), user_data)


@ -420,9 +431,7 @@ async def chat(message: types.Message):
    await message.reply(assistant_message, parse_mode=ParseMode.MARKDOWN)

    if user_data["options"]["assistant_voice_chat"]:
-        await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING)
-        voice_data = await text_to_voice(assistant_message)
-        await message.reply_voice(voice_data)
+        asyncio.create_task(send_voice_message(chat_id, assistant_message))


 if __name__ == "__main__":