Bug fixes and optimizations

2023-04-29 18:34:20 +02:00 · 2023-04-29 18:34:20 +02:00 · edf2131a2e
commit edf2131a2e
--- a/example.env
+++ b/example.env
@ -1,3 +1,6 @@
 # # # # # # # # # #
 # OpenAI / ChatGPT #
 # # # # # # # # # #
 OPENAI_API_KEY=your-openai-api-key
 OPENAI_MODEL=gpt-3.5-turbo
@ -8,11 +11,21 @@ CHATGPT_TEMPERATURE=1.0
 # Use Whisper transcript from voice message with ChatGPT
 WHISPER_TO_GPT=1
-# TTS Options
+# # # # # # # #
 # TTS Options #
 # # # # # # # #
 ENABLE_TTS=1
 # If USE_TTS=1, you can set the following options
 VOICE_LANGUAGE_LIST=en,es,fr,it,pt,ca
 DEFAULT_VOICE_LANGUAGE=en
 # If ENABLE_TTS=1, you can set the following options
 # Remove any language whose voice you don't want to download
 VOICE_LANGUAGE_LIST=en,es,fr,it,pt,ca,no,nl,de
 DEFAULT_VOICE_LANGUAGE=en 
 # Do not change this line
 LANGUAGES_H_SR=en,fr,nl,no
 # # # # # # # # # #
 # Telegram Options #
 # # # # # # # # # #
 BOT_TOKEN=your-telegram-bot-token
 BOT_ALLOWED_USERS= XXXX,YYYY # Comma-separated list of Telegram user IDs
--- a/main.py
+++ b/main.py
@ -1,10 +1,10 @@
 import asyncio
 import logging
 import os
-import tempfile
+import wave
 from functools import wraps
 from io import BytesIO
-
+from typing import Tuple
 import subprocess
 import openai
 from aiogram import Bot, Dispatcher, types
@ -12,8 +12,8 @@ from aiogram.contrib.middlewares.logging import LoggingMiddleware
 from aiogram.types import InlineKeyboardButton, InlineKeyboardMarkup, ParseMode
 from aiogram.utils import executor
 from dotenv import load_dotenv
 from pydub import AudioSegment
 from langdetect import detect
 from pydub import AudioSegment
 import database
@ -46,6 +46,7 @@ WHISPER_TO_GPT = bool(int(os.environ.get("WHISPER_TO_GPT")))
 ENABLE_TTS = bool(int(os.environ.get("ENABLE_TTS")))
 DEFAULT_VOICE_LANGUAGE = os.environ.get("DEFAULT_VOICE_LANGUAGE")
 VOICE_LANGUAGE_LIST = os.environ.get("VOICE_LANGUAGE_LIST")
 LANGUAGES_22050 = os.environ.get("LANGUAGES_H_SR").split(",")
 MAX_USER_CONTEXT = int(os.environ.get("CHATGPT_MAX_USER_CONTEXT"))
 openai.api_key = os.environ.get("OPENAI_API_KEY")
@ -107,7 +108,15 @@ def generate_settings_markup(chat_id: str) -> InlineKeyboardMarkup:
    return InlineKeyboardMarkup(inline_keyboard=keyboard)
-async def text_to_voice(text: str, language: str = None) -> BytesIO:
+async def send_voice_message(chat_id, assistant_message):
    await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING)
    audio_data, _ = await text_to_voice(assistant_message)
    audio_data.seek(0)  # Reset the buffer's position to the beginning
    await bot.send_voice(chat_id, audio_data)
 async def text_to_voice(text: str, language: str = None) -> Tuple[BytesIO, str]:
    binary_path = "./piper/piper"
    if language is None:
@ -115,34 +124,30 @@ async def text_to_voice(text: str, language: str = None) -> BytesIO:
    model_path = f"./piper/voices/{language}.onnx"
-    # Generate a unique temporary filename with '.ogg' extension
+    text = text.replace("\n", ". ")
    with tempfile.NamedTemporaryFile(suffix=".ogg", delete=False) as tmp:
        tmp_filename = tmp.name
-    # Run the binary with the escaped text as input and the temp file as output
+    cmd = [binary_path, "--model", model_path, "--output_raw"]
-    with open(tmp_filename, "wb") as tmp_file:
+    proc = await asyncio.create_subprocess_exec(
-        process = subprocess.Popen(
+        *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE
-            [binary_path, "--model", model_path, "--output_file", "-"],
+    )
            stdin=subprocess.PIPE,
            stdout=tmp_file,
            stderr=subprocess.PIPE,
            text=True,
            encoding="utf8",
        )
        # Remove all newlines from the text so that the text is read as a single sentence
        text = text.replace("\n", ". ")
        process.communicate(input=text)
-    # Open the file in binary mode and read its content into BytesIO object
+    stdout, _ = await proc.communicate(input=text.encode("utf-8"))
    with open(tmp_filename, "rb") as file:
        bytes_io = BytesIO(file.read())
-    # Delete the temporary file
+    # Create a new BytesIO object to store the WAV file.
-    os.remove(tmp_filename)
+    wav_file = BytesIO()
    with wave.open(wav_file, "wb") as wf:
        # Assuming 1 channel and 16 bits per sample; the sample rate is 22050 Hz
        # for languages listed in LANGUAGES_22050 and 16000 Hz otherwise.
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(22050 if language in LANGUAGES_22050 else 16000)
        wf.writeframes(stdout)
-    # Return the BytesIO object
+    wav_file.seek(0)
-    return bytes_io
+
    audio = AudioSegment.from_file(wav_file, format="wav")
    audio = audio.export(format="ogg", codec="libopus", parameters=["-vbr", "on"])
    return BytesIO(audio.read()), "ogg"
 def restricted(func):
@ -254,7 +259,7 @@ async def usage(message: types.Message) -> None:
 - Generated {user_usage["dalle"]} images with DALL-E.
 - Transcribed {round(float(user_usage["whisper"]) / 60.0, 2)}min with Whisper.
-Total spent: ${user_spent} ({user_percentage:.2f}% of total)
+User total: ${user_spent} ({user_percentage:.2f}% of total)
 Total usage:
 - ChatGPT tokens: {total_usage["chatgpt"]}
@ -297,9 +302,9 @@ async def attachment(message: types.Message):
    await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING)
    transcript = {"text": ""}
    audioMessage = False
    # Handle media types
    if message.voice:
        user_data["usage"]["whisper"] += message.voice.duration
        file_id = message.voice.file_id
@ -317,16 +322,19 @@ async def attachment(message: types.Message):
        await message.reply("Can't handle such file. Reason: unknown.")
        return
    # Download file from Telegram
    file = await bot.get_file(file_id)
    user_id = message.chat.id
    await file.download(f"{user_id}.{file_format}")
    # Convert audio to mp3 if needed, because OpenAI doesn't support ogg
    if file_format == "ogg":
        ogg_audio = AudioSegment.from_file(f"{user_id}.ogg", format="ogg")
        ogg_audio.export(f"{user_id}.mp3", format="mp3")
        os.remove(f"{user_id}.ogg")
        file_format = "mp3"
    # Transcribe audio with OpenAI API
    with open(f"{user_id}.{file_format}", "rb") as audio_file:
        try:
            await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING)
@ -336,25 +344,28 @@ async def attachment(message: types.Message):
            await message.reply("Transcript failed.")
            os.remove(f"{user_id}.{file_format}")
            return
        os.remove(f"{user_id}.{file_format}")
-    os.remove(f"{user_id}.{file_format}")
+    # Handle empty transcript
    if transcript["text"] == "":
        transcript["text"] = "[Silence]"
    # If whisper_to_chat is enabled, send transcript to ChatGPT and send its response along with the transcript
    chatGPT_response = False
    if audioMessage and user_data["options"]["whisper_to_chat"]:
        chatGPT_response, user_data = await messageGPT(
            transcript["text"], str(chat_id), message.from_user.full_name, user_data
        )
-        transcript["text"] = "> " + transcript["text"] + "\n\n" + chatGPT_response
+        transcript["text"] = ''.join(["> ", transcript["text"], "\n\n", chatGPT_response])
    # Send transcript (and ChatGPT response if enabled)
    await message.reply(transcript["text"])
    # Send ChatGPT response as voice message with piper TTS if enabled
    if user_data["options"]["assistant_voice_chat"] and chatGPT_response:
-        await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING)
+        asyncio.create_task(send_voice_message(chat_id, chatGPT_response))
-        voice_data = await text_to_voice(chatGPT_response)
+        
-        await message.reply_voice(voice_data)
+    # Update user data
    database.update_user(str(chat_id), user_data)
@ -420,9 +431,7 @@ async def chat(message: types.Message):
    await message.reply(assistant_message, parse_mode=ParseMode.MARKDOWN)
    if user_data["options"]["assistant_voice_chat"]:
-        await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING)
+        asyncio.create_task(send_voice_message(chat_id, assistant_message))
        voice_data = await text_to_voice(assistant_message)
        await message.reply_voice(voice_data)
 if __name__ == "__main__":