Add video transcription and reply-as-context support.

2023-03-07 21:06:14 +01:00 · 2023-03-07 21:06:14 +01:00 · c8674d3120
commit c8674d3120
--- a/README.md
+++ b/README.md
@@ -6,14 +6,16 @@ A telegram bot to interact with OpenAI API. You can:

 - Generate images with DALL-E: `/imagine`
 - Chat with ChatGPT: Just chat!
-- Transcribe audio to text: Just send a voice message!
+- Transcribe audio and video to text: Just send a voice message or a video file!

 Other features include:

 - Clear ChatGPT context history (to save tokens).
+- Reply to any message to use it as context for ChatGPT.
 - Per-user context and usage metrics and spent $.
 - No database, data is saved in-memory.
-- Lightweight: just a single python file.
+  - Drawback: all data is reset whenever the Docker container restarts. A solution for this is still being looked into.
+- Lightweight: a single Python file.

 [Jump to selfhosting guide](#self-hosting)

--- a/main.py
+++ b/main.py
@@ -2,6 +2,7 @@ import os
 import re
 import openai
 import logging
+import math
 from pydub import AudioSegment
 from telegram import Update
 from functools import wraps
@@ -64,8 +65,7 @@ async def imagine(update: Update, context: ContextTypes.DEFAULT_TYPE):
    
@restricted
 async def attachment(update: Update, context: ContextTypes.DEFAULT_TYPE):
-    print(update.message)
-    try:
+    if update.message.voice:
        users[f"{update.effective_chat.id}"]["usage"]['whisper'] += update.message.voice.duration
        file = await context.bot.get_file(update.message.voice.file_id)
        await file.download_to_drive(f"{update.effective_user.id}.ogg")
@@ -80,8 +80,21 @@ async def attachment(update: Update, context: ContextTypes.DEFAULT_TYPE):
        os.remove(f"{update.effective_user.id}.mp3")
        if transcript['text'] == "":
            transcript['text'] = "[Silence]"
-        await context.bot.send_message(chat_id=update.effective_chat.id, text=transcript['text'])
-    except:
+        await context.bot.send_message(chat_id=update.effective_chat.id, text=transcript['text'])        
+    elif update.message.video:
+        users[f"{update.effective_chat.id}"]["usage"]['whisper'] += update.message.video.duration
+        file = await context.bot.get_file(update.message.video.file_id)
+        await file.download_to_drive(f"{update.effective_user.id}.mp4")
+        video_file= open(f"{update.effective_user.id}.mp4", "rb")
+        try:
+            transcript = openai.Audio.transcribe("whisper-1", video_file)
+        except:
+            await context.bot.send_message(chat_id=update.effective_chat.id, text="Transcript failed.")
+        os.remove(f"{update.effective_user.id}.mp4")
+        if transcript['text'] == "":
+            transcript['text'] = "[Silence]"
+        await context.bot.send_message(chat_id=update.effective_chat.id, text=transcript['text'])    
+    else:
        await context.bot.send_message(chat_id=update.effective_chat.id, text="Can't handle such file. Reason: unkown.")

@restricted
@@ -89,13 +102,20 @@ async def chat(update: Update, context: ContextTypes.DEFAULT_TYPE):
    if not f"{update.effective_chat.id}" in users:
        users[f"{update.effective_chat.id}"] = {"context": [], "usage": {"chatgpt": 0,"whisper": 0,"dalle": 0,}}
    
+    # If this message is a reply, prepend the replied-to message's text as context
+    if hasattr(update.message.reply_to_message, "text"):
+        userPrompt = f"In reply to: '{update.message.reply_to_message.text}' \n---\n {update.message.text}"
+    else:
+        userPrompt = update.message.text
+        
    # Save context
    if len(users[f"{update.effective_chat.id}"]["context"]) <= MAX_USER_CONTEXT:
-        users[f"{update.effective_chat.id}"]["context"].append({"role": "user", "content": f"{update.message.text}"})
+        users[f"{update.effective_chat.id}"]["context"].append({"role": "user", "content": f"{userPrompt}"})
    else:
        users[f"{update.effective_chat.id}"]["context"].pop(0)
-        users[f"{update.effective_chat.id}"]["context"].append({"role": "user", "content": f"{update.message.text}"})
+        users[f"{update.effective_chat.id}"]["context"].append({"role": "user", "content": f"{userPrompt}"})
    
+        
    # Interact with ChatGPT api   
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
@@ -129,7 +149,7 @@ async def usage(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    total_spent+=(user_info['chatgpt']/750)*0.002
    total_spent+=float(user_info['dalle'])*0.02
    total_spent+=(user_info['whisper']/60.0)*0.006 
-    info_message=f"""User: {update.effective_user.name}\n- Used {user_info["chatgpt"]} characters with ChatGPT.\n- Generated {user_info["dalle"]} images with DALL-E.\n- Transcribed {user_info["whisper"]}min with Whisper.\n\nTotal spent: ${str(total_spent)}"""
+    info_message=f"""User: {update.effective_user.name}\n- Used {user_info["chatgpt"]} characters with ChatGPT.\n- Generated {user_info["dalle"]} images with DALL-E.\n- Transcribed {round(float(user_info["whisper"])/60.0, 2)}min with Whisper.\n\nTotal spent: ${str(total_spent)}"""
    await context.bot.send_message(chat_id=update.effective_chat.id, text=info_message)
    
@restricted