tts-voices / enhance.py
Václav Volhejn
Enhance Ears and VCTK datasets
a0de156
raw
history blame
8.27 kB
"""Enhance audio files using AI-Coustics API."""
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "librosa",
# "requests",
# "soundfile",
# "tqdm",
# ]
# ///
import json
import os
import time
from pathlib import Path
from typing import List, Optional
import librosa
import requests
import soundfile as sf
import tqdm
class AiCousticsEnhancer:
    """Client for the AI-Coustics audio enhancement API.

    The workflow is: upload a file (``upload_audio``), poll until the
    service reports completion (``wait_for_completion``), then fetch the
    result (``download_enhanced``).
    """

    def __init__(self, api_key: str, request_timeout: float = 60.0):
        """
        Initialize the AI-Coustics API client

        Args:
            api_key: Your AI-Coustics API key
            request_timeout: Value passed as ``timeout`` to every HTTP
                request so that a stalled connection raises instead of
                blocking forever
        """
        self.api_key = api_key
        self.base_url = "https://api.ai-coustics.io/v2"
        self.headers = {"X-API-Key": self.api_key}
        self.request_timeout = request_timeout

    def upload_audio(
        self,
        file_path: str,
        enhancement_level: int = 100,
        enhancement_model: str = "LARK_V2",
        loudness_target: int = -19,
        true_peak: int = -1,
        transcode: str = "WAV",
    ) -> dict:
        """
        Upload an audio file for enhancement

        Args:
            file_path: Path to the audio file
            enhancement_level: Enhancement strength (0-100)
            enhancement_model: Model to use (LARK_V2 or FINCH_V2)
            loudness_target: Target loudness in LUFS
            true_peak: True peak level in dBFS
            transcode: Output format (WAV, MP3, etc.)

        Returns:
            Response dictionary with uid and metadata

        Raises:
            requests.HTTPError: If the API answers with an error status
        """
        url = f"{self.base_url}/medias"

        # The enhancement parameters travel as a JSON string in a form field
        # next to the multipart file payload.
        media_enhancement = {
            "loudness_target": loudness_target,
            "true_peak": true_peak,
            "enhancement_level": enhancement_level,
            "enhancement_model": enhancement_model,
            "transcode": transcode,
        }

        with open(file_path, "rb") as f:
            response = requests.post(
                url,
                headers=self.headers,
                files={"file": f},
                data={"media_enhancement": json.dumps(media_enhancement)},
                timeout=self.request_timeout,
            )
        response.raise_for_status()
        return response.json()

    def check_status(self, uid: str) -> dict:
        """
        Check the processing status of an uploaded media file

        Args:
            uid: Unique identifier returned from upload

        Returns:
            Metadata dictionary with current status

        Raises:
            requests.HTTPError: If the API answers with an error status
        """
        url = f"{self.base_url}/medias/{uid}/metadata"
        response = requests.get(
            url, headers=self.headers, timeout=self.request_timeout
        )
        response.raise_for_status()
        return response.json()

    def wait_for_completion(
        self, uid: str, poll_interval: int = 2, timeout: int = 300
    ) -> dict:
        """
        Poll the API until processing is complete

        Args:
            uid: Unique identifier returned from upload
            poll_interval: Seconds between status checks
            timeout: Maximum seconds to wait

        Returns:
            Final metadata dictionary

        Raises:
            RuntimeError: If the service reports the status "FAILED"
            TimeoutError: If "COMPLETED" is not reported within ``timeout``
                seconds (a non-positive ``timeout`` raises without polling)
        """
        # A monotonic clock keeps the deadline correct even if the system
        # wall clock is adjusted while we are waiting.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            metadata = self.check_status(uid)
            status = metadata.get("enhancement_status")
            print(f"Status: {status}")

            if status == "COMPLETED":
                return metadata
            if status == "FAILED":
                raise RuntimeError(f"Enhancement failed: {metadata}")

            time.sleep(poll_interval)

        raise TimeoutError(f"Processing did not complete within {timeout} seconds")

    def download_enhanced(self, uid: str, output_path: str):
        """
        Download the enhanced audio file

        The data is streamed into ``<output_path>.part`` and only renamed to
        ``output_path`` once the download has finished, so an interrupted
        transfer never leaves a truncated file under the final name.

        Args:
            uid: Unique identifier returned from upload
            output_path: Path where to save the enhanced file

        Raises:
            requests.HTTPError: If the API answers with an error status
        """
        url = f"{self.base_url}/medias/{uid}/file"
        partial_path = f"{output_path}.part"

        try:
            # Using the response as a context manager releases the connection
            # even when streaming stops half-way.
            with requests.get(
                url,
                headers=self.headers,
                stream=True,
                timeout=self.request_timeout,
            ) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(partial_path, output_path)
        except BaseException:
            # Best-effort cleanup of the incomplete download; the original
            # error is what the caller needs to see.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            raise

        print(f"Enhanced audio saved to: {output_path}")
def get_enhanced_output_path(input_path: Path) -> Path:
    """Return the sibling path whose stem is extended with '_enhanced'.

    The parent directory and the file extension are kept as they are; only
    the final component's stem changes (``clip.wav`` -> ``clip_enhanced.wav``).
    """
    enhanced_name = f"{input_path.stem}_enhanced{input_path.suffix}"
    return input_path.with_name(enhanced_name)
def enhance_audio_files(
    input_paths: List[Path],
    api_key: str,
    enhancement_level: int = 90,
    enhancement_model: str = "LARK_V2",
) -> List[Path]:
    """
    Run every not-yet-enhanced input file through the AI-Coustics API.

    Inputs that do not exist, or whose enhanced counterpart already exists,
    are dropped before processing starts. A failure on one file is printed
    and the remaining files are still processed.

    Args:
        input_paths: List of paths to audio files
        api_key: Your AI-Coustics API key
        enhancement_level: Enhancement strength (0-100)
        enhancement_model: Model to use (LARK_V2 or FINCH_V2)

    Returns:
        List of paths to the enhanced files written during this call
    """
    enhancer = AiCousticsEnhancer(api_key)

    # Work out up front which inputs still need processing, so the progress
    # bar only counts real work.
    pending: List[Path] = []
    for candidate in input_paths:
        if not candidate.exists():
            continue
        if get_enhanced_output_path(candidate).exists():
            continue
        pending.append(candidate)

    produced: List[Path] = []
    for source in tqdm.tqdm(pending):
        target = get_enhanced_output_path(source)

        # Re-check: the target may have appeared since the filter above ran,
        # e.g. when the same input was listed more than once.
        if target.exists():
            print(f"Found {target}, skipping")
            continue

        target.parent.mkdir(parents=True, exist_ok=True)

        try:
            print("Uploading to AI-Coustics...")
            upload_info = enhancer.upload_audio(
                str(source),
                enhancement_level=enhancement_level,
                enhancement_model=enhancement_model,
            )
            uid = upload_info["uid"]
            print(f"Uploaded successfully. UID: {uid}")

            print("Waiting for enhancement to complete...")
            enhancer.wait_for_completion(uid)
            print("Enhancement completed!")

            enhancer.download_enhanced(uid, str(target))
            produced.append(target)
        except Exception as err:
            # Deliberately best-effort: report and move on to the next file.
            print(f"Error processing file {source}: {err}")

    banner = "=" * 50
    print(f"\n{banner}")
    print(
        f"Processing complete! Enhanced {len(produced)}/{len(input_paths)} files"
    )
    print(banner)

    return produced
def collect_audio_files(paths: List[str]) -> List[Path]:
    """Collect audio files from a mix of files and directories.

    Each entry is resolved to an absolute path. A regular file is taken
    as-is; a directory contributes whatever ``librosa.util.find_files``
    reports for it; anything else is skipped with a warning.

    Files whose stem ends with '_enhanced' are left out, because that is the
    suffix this script gives to its own outputs. A name that merely contains
    '_enhanced' somewhere else (e.g. ``my_enhanced_take.wav``) is kept.

    Args:
        paths: File and/or directory paths

    Returns:
        Resolved paths of the files to process, in the order encountered
    """
    audio_files: List[Path] = []
    for p in paths:
        path = Path(p).resolve()
        if path.is_file():
            audio_files.append(path)
        elif path.is_dir():
            audio_files.extend(Path(f) for f in librosa.util.find_files(path))
        else:
            print(f"Warning: {p} is not a valid file or directory, skipping")

    return [f for f in audio_files if not f.stem.endswith("_enhanced")]
if __name__ == "__main__":
    # Command-line entry point: gather the inputs, enhance them, and list the
    # files that were written.
    import argparse

    parser = argparse.ArgumentParser(
        description="Enhance audio files using AI-Coustics API"
    )
    parser.add_argument(
        "inputs",
        nargs="+",
        help="Input audio files or directories to process",
    )
    parser.add_argument(
        "--api-key",
        default=os.environ.get("AICOUSTICS_API_KEY"),
        help="AI-Coustics API key (default: AICOUSTICS_API_KEY env var)",
    )
    parser.add_argument(
        "--enhancement-level",
        type=int,
        default=100,
        help="Enhancement strength 0-100 (default: 100)",
    )
    parser.add_argument(
        "--model",
        default="LARK_V2",
        choices=["LARK_V2", "FINCH_V2"],
        help="Enhancement model (default: LARK_V2)",
    )

    args = parser.parse_args()

    if not args.api_key:
        parser.error("API key required via --api-key or AICOUSTICS_API_KEY env var")

    # Reject out-of-range values locally instead of spending an upload on a
    # request that does not match the documented 0-100 range.
    if not 0 <= args.enhancement_level <= 100:
        parser.error("--enhancement-level must be between 0 and 100")

    input_files = collect_audio_files(args.inputs)

    if not input_files:
        print("No valid audio files found to process.")
        # SystemExit rather than the `exit` helper, which is only injected by
        # the `site` module and is not guaranteed to exist.
        raise SystemExit(1)

    print(f"Found {len(input_files)} audio files to process")

    enhanced_files = enhance_audio_files(
        input_paths=input_files,
        api_key=args.api_key,
        enhancement_level=args.enhancement_level,
        enhancement_model=args.model,
    )

    print("\nEnhanced files:")
    for enhanced_path in enhanced_files:
        print(f"  - {enhanced_path}")