Spaces:

AlserFurma
/

LipSyncAI

Sleeping

App Files Files Community

AlserFurma committed 28 days ago

Commit

f1b66ce

verified ·

1 Parent(s): ff0ae3b

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -54

app.py CHANGED Viewed

@@ -16,7 +16,6 @@ from pydub import AudioSegment
 # Параметры
 # =========================
 TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
@@ -43,21 +42,18 @@ try:
     )
     print("✅ Все модели успешно загружены!")
 except Exception as e:
     raise RuntimeError(f"❌ Ошибка загрузки моделей: {str(e)}")
 # =========================
 # Вспомогательные функции
 # =========================
 def generate_quiz(text: str):
-    """
-    Генерирует один вопрос и два варианта ответа на основе текста.
     Алгоритмы:
-      1. Базовый: случайное предложение и первые слова.
-      2. Пропуск ключевого слова.
-      3. Вопрос о числе/дате.
     """
     try:
         sentences = [s.strip() for s in text.replace("!", ".").replace("?", ".").split(".") if s.strip()]
@@ -65,9 +61,9 @@ def generate_quiz(text: str):
             raise ValueError("Текст слишком короткий")
         algo = random.choice([1, 2, 3])
         # ------------------------
-        if algo == 1:
-            # Базовый алгоритм
             question_sentence = random.choice(sentences)
             words = question_sentence.split()
             if len(words) <= 3:
@@ -76,14 +72,12 @@ def generate_quiz(text: str):
             else:
                 question = "Что сказано в тексте?"
                 correct_answer = " ".join(words[:6]) + ("..." if len(words) > 6 else "")
             wrong_sentence = random.choice([s for s in sentences if s != question_sentence] or ["Другая информация"])
             wrong_words = wrong_sentence.split()
             wrong_answer = " ".join(wrong_words[:6]) + ("..." if len(wrong_words) > 6 else "")
         # ------------------------
-        elif algo == 2:
-            # Пропуск ключевого слова
             question_sentence = random.choice(sentences)
             words = question_sentence.split()
             if len(words) > 2:
@@ -94,10 +88,9 @@ def generate_quiz(text: str):
             else:
                 # fallback
                 return generate_quiz(text)
         # ------------------------
-        elif algo == 3:
-            # Вопрос о числе или дате
             import re
             question_sentence = random.choice(sentences)
             numbers = re.findall(r'\d+', question_sentence)
@@ -109,26 +102,26 @@ def generate_quiz(text: str):
             else:
                 # fallback к базовому
                 return generate_quiz(text)
         options = [correct_answer, wrong_answer]
         random.shuffle(options)
         return question, options, correct_answer
     except Exception as e:
         raise ValueError(f"Ошибка генерации вопроса: {str(e)}")
 def synthesize_audio(text_ru: str):
     """Переводит русскую строку на казахский, синтезирует аудио и возвращает путь к файлу .wav"""
     translation = translator(text_ru, src_lang="rus_Cyrl", tgt_lang="kaz_Cyrl")
     text_kk = translation[0]["translation_text"]
     inputs = tts_tokenizer(text_kk, return_tensors="pt").to(device)
     with torch.no_grad():
-        output = tts_model(**inputs)
     waveform = output.waveform.squeeze().cpu().numpy()
     audio = (waveform * 32767).astype('int16')
     sampling_rate = getattr(tts_model.config, 'sampling_rate', 22050)
@@ -137,24 +130,22 @@ def synthesize_audio(text_ru: str):
     tmpf.close()
     return tmpf.name
 def concatenate_audio_files(audio_files):
     """Join several WAV files into a single WAV file with pauses in between.

     Args:
         audio_files: Sequence of paths to WAV files, in playback order.

     Returns:
         Path to a newly created temporary ``.wav`` file holding all inputs
         back to back, separated by one second of silence. The file is
         created with ``delete=False``, so the caller has to remove it.
     """
     pause = AudioSegment.silent(duration=1000)  # one-second gap between clips
     combined = AudioSegment.empty()
     last_index = len(audio_files) - 1
     for i, audio_file in enumerate(audio_files):
         combined += AudioSegment.from_wav(audio_file)
         if i < last_index:  # no trailing pause after the final clip
             combined += pause
     # Reserve the output path and close the handle before exporting: export
     # is given the path and writes to it by name, which fails on Windows
     # while the temporary file is still open, and the handle would leak if
     # the export raised.
     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as output_file:
         output_path = output_file.name
     combined.export(output_path, format='wav')
     return output_path
 def make_talking_head(image_path: str, audio_path: str, max_retries=3):
     """Вызывает SkyReels/Talking Head space и возвращает путь или URL видео."""
     for attempt in range(max_retries):
@@ -167,7 +158,6 @@ def make_talking_head(image_path: str, audio_path: str, max_retries=3):
                 steps=10,
                 api_name="/process_image_audio"
             )
             print(f"Result type: {type(result)}")
             print(f"Result content: {result}")
@@ -188,7 +178,6 @@ def make_talking_head(image_path: str, audio_path: str, max_retries=3):
                 return result
             else:
                 raise ValueError(f"Unexpected talking head result: {type(result)}, value: {result}")
         except Exception as e:
             if attempt < max_retries - 1:
                 print(f"Попытка {attempt + 1} не удалась: {e}. Повторяю через 2 секунды...")
@@ -196,7 +185,6 @@ def make_talking_head(image_path: str, audio_path: str, max_retries=3):
             else:
                 raise Exception(f"Ошибка после {max_retries} попыток: {str(e)}")
 # =========================
 # Основные обработчики для Gradio
 # =========================
@@ -219,24 +207,24 @@ def start_lesson(image: Image.Image, text: str, state):
         # Создаем три аудио файла
         audio_files = []
         # 1. Текст лекции
         audio1 = synthesize_audio(text)
         audio_files.append(audio1)
         # 2. Вопрос
         question_text = f"А теперь вопрос: {question}"
         audio2 = synthesize_audio(question_text)
         audio_files.append(audio2)
         # 3. Варианты ответа
         options_text = f"Первый вариант: {options[0]}. Второй вариант: {options[1]}"
         audio3 = synthesize_audio(options_text)
         audio_files.append(audio3)
         # Объединяем все аудио в одно
         combined_audio = concatenate_audio_files(audio_files)
         # Создаем одно видео с полным содержанием
         video_path = make_talking_head(image_path, combined_audio)
@@ -250,9 +238,9 @@ def start_lesson(image: Image.Image, text: str, state):
         # Удаляем временные аудио файлы
         for audio_file in audio_files:
-            try:
                 os.remove(audio_file)
-            except:
                 pass
         try:
             os.remove(combined_audio)
@@ -260,25 +248,22 @@ def start_lesson(image: Image.Image, text: str, state):
             pass
         question_display = f"**Вопрос:** {question}"
         return (
-            video_path,
-            question_display,
             gr.update(value=options[0], visible=True),
             gr.update(value=options[1], visible=True),
             state_data
         )
     except Exception as e:
         traceback.print_exc()
         return None, f"❌ Ошибка: {e}", gr.update(visible=False), gr.update(visible=False), state
 def answer_selected(selected_option: str, state):
     """Генерирует реакцию лектора и показывает в том же окне"""
     if not state:
         return None, "❌ Ошибка: отсутствует состояние урока"
     try:
         correct = state.get('correct')
         image_path = state.get('image_path')
@@ -292,22 +277,20 @@ def answer_selected(selected_option: str, state):
         # Создаем аудио с реакцией
         audio_path = synthesize_audio(reaction_ru)
         # Создаем видео с реакцией
         reaction_video = make_talking_head(image_path, audio_path)
-        try:
             os.remove(audio_path)
-        except:
             pass
         return reaction_video, display_message
     except Exception as e:
         traceback.print_exc()
         return None, f"❌ Ошибка: {e}"
 # =========================
 # Gradio UI
 # =========================
@@ -326,8 +309,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         with gr.Column(scale=1):
             inp_image = gr.Image(type='pil', label='📸 Мұғалімнің суреті')
             inp_text = gr.Textbox(
-                lines=5,
-                label='📝 Дәріс мәтіні (орыс.)',
                 placeholder='Дәріс мәтінін енгізіңіз...',
                 info="Ең көбі 500 таңба"
             )
@@ -336,11 +319,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         with gr.Column(scale=1):
             out_video = gr.Video(label='🎬 Мұғалімнің видеосы')
             out_question = gr.Markdown("")
             with gr.Row():
                 btn_opt1 = gr.Button("Вариант 1", visible=False, size="lg", variant="secondary")
                 btn_opt2 = gr.Button("Вариант 2", visible=False, size="lg", variant="secondary")
             out_result = gr.Markdown("")
     lesson_state = gr.State({})
@@ -356,17 +337,16 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     def handle_answer_1(state):
         """Handle a click on the first answer button.

         Args:
             state: Lesson state dict; its 'options' entry, when present, is
                 the list of answer options. May be empty or None.

         Returns:
             Whatever ``answer_selected`` returns for the first option.
         """
         options = state.get('options', ['']) if state else ['']
         # Guard against an empty options list, mirroring the length check in
         # handle_answer_2, so a malformed state cannot raise IndexError here.
         option = options[0] if options else ''
         return answer_selected(option, state)
     def handle_answer_2(state):
         """Handle a click on the second answer button.

         Passes the second entry of ``state['options']`` to
         ``answer_selected``, or an empty string when the state is empty or
         holds fewer than two options.
         """
         option = ''
         if state:
             choices = state.get('options', [])
             if len(choices) > 1:
                 option = choices[1]
         return answer_selected(option, state)
     btn_opt1.click(
         fn=handle_answer_1,
         inputs=[lesson_state],
         outputs=[out_video, out_result]
     )
     btn_opt2.click(
         fn=handle_answer_2,
         inputs=[lesson_state],

 # Параметры
 # =========================
 TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
     )
     print("✅ Все модели успешно загружены!")
 except Exception as e:
     raise RuntimeError(f"❌ Ошибка загрузки моделей: {str(e)}")
 # =========================
 # Вспомогательные функции
 # =========================
 def generate_quiz(text: str):
+    """ Генерирует один вопрос и два варианта ответа на основе текста.
     Алгоритмы:
+    1. Базовый: случайное предложение и первые слова.
+    2. Пропуск ключевого слова.
+    3. Вопрос о числе/дате.
     """
     try:
         sentences = [s.strip() for s in text.replace("!", ".").replace("?", ".").split(".") if s.strip()]
             raise ValueError("Текст слишком короткий")
         algo = random.choice([1, 2, 3])
         # ------------------------
+        if algo == 1:  # Базовый алгоритм
             question_sentence = random.choice(sentences)
             words = question_sentence.split()
             if len(words) <= 3:
             else:
                 question = "Что сказано в тексте?"
                 correct_answer = " ".join(words[:6]) + ("..." if len(words) > 6 else "")
             wrong_sentence = random.choice([s for s in sentences if s != question_sentence] or ["Другая информация"])
             wrong_words = wrong_sentence.split()
             wrong_answer = " ".join(wrong_words[:6]) + ("..." if len(wrong_words) > 6 else "")
         # ------------------------
+        elif algo == 2:  # Пропуск ключевого слова
             question_sentence = random.choice(sentences)
             words = question_sentence.split()
             if len(words) > 2:
             else:
                 # fallback
                 return generate_quiz(text)
         # ------------------------
+        elif algo == 3:  # Вопрос о числе или дате
             import re
             question_sentence = random.choice(sentences)
             numbers = re.findall(r'\d+', question_sentence)
             else:
                 # fallback к базовому
                 return generate_quiz(text)
         options = [correct_answer, wrong_answer]
         random.shuffle(options)
         return question, options, correct_answer
     except Exception as e:
         raise ValueError(f"Ошибка генерации вопроса: {str(e)}")
 def synthesize_audio(text_ru: str):
     """Переводит русскую строку на казахский, синтезирует аудио и возвращает путь к файлу .wav"""
     translation = translator(text_ru, src_lang="rus_Cyrl", tgt_lang="kaz_Cyrl")
     text_kk = translation[0]["translation_text"]
     inputs = tts_tokenizer(text_kk, return_tensors="pt").to(device)
     with torch.no_grad():
+        output = tts_model(**inputs, noise_scale=0.7, noise_scale_w=0.9, length_scale=1.2)
     waveform = output.waveform.squeeze().cpu().numpy()
+    waveform /= np.max(np.abs(waveform)) + 1e-8  # Нормализация для лучшего качества
     audio = (waveform * 32767).astype('int16')
     sampling_rate = getattr(tts_model.config, 'sampling_rate', 22050)
     tmpf.close()
     return tmpf.name
 def concatenate_audio_files(audio_files):
     """Join several WAV files into a single WAV file with pauses in between.

     Args:
         audio_files: Sequence of paths to WAV files, in playback order.

     Returns:
         Path to a newly created temporary ``.wav`` file holding all inputs
         back to back, separated by one second of silence. The file is
         created with ``delete=False``, so the caller has to remove it.
     """
     pause = AudioSegment.silent(duration=1000)  # one-second gap between clips
     combined = AudioSegment.empty()
     last_index = len(audio_files) - 1
     for i, audio_file in enumerate(audio_files):
         combined += AudioSegment.from_wav(audio_file)
         if i < last_index:  # no trailing pause after the final clip
             combined += pause
     # Reserve the output path and close the handle before exporting: export
     # is given the path and writes to it by name, which fails on Windows
     # while the temporary file is still open, and the handle would leak if
     # the export raised.
     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as output_file:
         output_path = output_file.name
     combined.export(output_path, format='wav')
     return output_path
 def make_talking_head(image_path: str, audio_path: str, max_retries=3):
     """Вызывает SkyReels/Talking Head space и возвращает путь или URL видео."""
     for attempt in range(max_retries):
                 steps=10,
                 api_name="/process_image_audio"
             )
             print(f"Result type: {type(result)}")
             print(f"Result content: {result}")
                 return result
             else:
                 raise ValueError(f"Unexpected talking head result: {type(result)}, value: {result}")
         except Exception as e:
             if attempt < max_retries - 1:
                 print(f"Попытка {attempt + 1} не удалась: {e}. Повторяю через 2 секунды...")
             else:
                 raise Exception(f"Ошибка после {max_retries} попыток: {str(e)}")
 # =========================
 # Основные обработчики для Gradio
 # =========================
         # Создаем три аудио файла
         audio_files = []
         # 1. Текст лекции
         audio1 = synthesize_audio(text)
         audio_files.append(audio1)
         # 2. Вопрос
         question_text = f"А теперь вопрос: {question}"
         audio2 = synthesize_audio(question_text)
         audio_files.append(audio2)
         # 3. Варианты ответа
         options_text = f"Первый вариант: {options[0]}. Второй вариант: {options[1]}"
         audio3 = synthesize_audio(options_text)
         audio_files.append(audio3)
         # Объединяем все аудио в одно
         combined_audio = concatenate_audio_files(audio_files)
         # Создаем одно видео с полным содержанием
         video_path = make_talking_head(image_path, combined_audio)
         # Удаляем временные аудио файлы
         for audio_file in audio_files:
+            try:
                 os.remove(audio_file)
+            except:
                 pass
         try:
             os.remove(combined_audio)
             pass
         question_display = f"**Вопрос:** {question}"
         return (
+            video_path,
+            question_display,
             gr.update(value=options[0], visible=True),
             gr.update(value=options[1], visible=True),
             state_data
         )
     except Exception as e:
         traceback.print_exc()
         return None, f"❌ Ошибка: {e}", gr.update(visible=False), gr.update(visible=False), state
 def answer_selected(selected_option: str, state):
     """Генерирует реакцию лектора и показывает в том же окне"""
     if not state:
         return None, "❌ Ошибка: отсутствует состояние урока"
     try:
         correct = state.get('correct')
         image_path = state.get('image_path')
         # Создаем аудио с реакцией
         audio_path = synthesize_audio(reaction_ru)
         # Создаем видео с реакцией
         reaction_video = make_talking_head(image_path, audio_path)
+        try:
             os.remove(audio_path)
+        except:
             pass
         return reaction_video, display_message
     except Exception as e:
         traceback.print_exc()
         return None, f"❌ Ошибка: {e}"
 # =========================
 # Gradio UI
 # =========================
         with gr.Column(scale=1):
             inp_image = gr.Image(type='pil', label='📸 Мұғалімнің суреті')
             inp_text = gr.Textbox(
+                lines=5,
+                label='📝 Дәріс мәтіні (орыс.)',
                 placeholder='Дәріс мәтінін енгізіңіз...',
                 info="Ең көбі 500 таңба"
             )
         with gr.Column(scale=1):
             out_video = gr.Video(label='🎬 Мұғалімнің видеосы')
             out_question = gr.Markdown("")
             with gr.Row():
                 btn_opt1 = gr.Button("Вариант 1", visible=False, size="lg", variant="secondary")
                 btn_opt2 = gr.Button("Вариант 2", visible=False, size="lg", variant="secondary")
             out_result = gr.Markdown("")
     lesson_state = gr.State({})
     def handle_answer_1(state):
         """Handle a click on the first answer button.

         Args:
             state: Lesson state dict; its 'options' entry, when present, is
                 the list of answer options. May be empty or None.

         Returns:
             Whatever ``answer_selected`` returns for the first option.
         """
         options = state.get('options', ['']) if state else ['']
         # Guard against an empty options list, mirroring the length check in
         # handle_answer_2, so a malformed state cannot raise IndexError here.
         option = options[0] if options else ''
         return answer_selected(option, state)
     def handle_answer_2(state):
         """Handle a click on the second answer button.

         Passes the second entry of ``state['options']`` to
         ``answer_selected``, or an empty string when the state is empty or
         holds fewer than two options.
         """
         option = ''
         if state:
             choices = state.get('options', [])
             if len(choices) > 1:
                 option = choices[1]
         return answer_selected(option, state)
     btn_opt1.click(
         fn=handle_answer_1,
         inputs=[lesson_state],
         outputs=[out_video, out_result]
     )
     btn_opt2.click(
         fn=handle_answer_2,
         inputs=[lesson_state],