Václav Volhejn

Enhance Ears and VCTK datasets

a0de156 3 months ago

8.27 kB

	"""Enhance audio files using AI-Coustics API."""

	# /// script
	# requires-python = ">=3.12"
	# dependencies = [
	# "librosa",
	# "requests",
	# "soundfile",
	# "tqdm",
	# ]
	# ///
	import json
	import os
	import time
	from pathlib import Path
	from typing import List, Optional

	import librosa
	import requests
	import soundfile as sf
	import tqdm


	class AiCousticsEnhancer:
	    """Client for the AI-Coustics audio enhancement API.

	    The workflow is: ``upload_audio`` -> ``wait_for_completion`` ->
	    ``download_enhanced``. Every HTTP call sends the API key in the
	    ``X-API-Key`` header and is bounded by ``request_timeout``.
	    """

	    def __init__(self, api_key: str, request_timeout: float = 120.0):
	        """
	        Initialize the AI-Coustics API client

	        Args:
	            api_key: Your AI-Coustics API key
	            request_timeout: Value passed as ``timeout`` to every ``requests``
	                call, so that a stalled connection raises instead of
	                blocking forever
	        """
	        self.api_key = api_key
	        self.base_url = "https://api.ai-coustics.io/v2"
	        self.headers = {"X-API-Key": self.api_key}
	        self.request_timeout = request_timeout

	    def upload_audio(
	        self,
	        file_path: str,
	        enhancement_level: int = 100,
	        enhancement_model: str = "LARK_V2",
	        loudness_target: int = -19,
	        true_peak: int = -1,
	        transcode: str = "WAV",
	    ) -> dict:
	        """
	        Upload an audio file for enhancement

	        Args:
	            file_path: Path to the audio file
	            enhancement_level: Enhancement strength (0-100)
	            enhancement_model: Model to use (LARK_V2 or FINCH_V2)
	            loudness_target: Target loudness in LUFS
	            true_peak: True peak level in dBFS
	            transcode: Output format (WAV, MP3, etc.)

	        Returns:
	            Response dictionary with uid and metadata

	        Raises:
	            OSError: If the file cannot be opened
	            requests.HTTPError: If the API answers with an error status
	        """
	        url = f"{self.base_url}/medias"

	        # The enhancement parameters travel as one JSON-encoded form field
	        # next to the multipart file upload.
	        media_enhancement = {
	            "loudness_target": loudness_target,
	            "true_peak": true_peak,
	            "enhancement_level": enhancement_level,
	            "enhancement_model": enhancement_model,
	            "transcode": transcode,
	        }
	        data = {"media_enhancement": json.dumps(media_enhancement)}

	        # The file handle must stay open while requests streams the body.
	        with open(file_path, "rb") as f:
	            response = requests.post(
	                url,
	                headers=self.headers,
	                files={"file": f},
	                data=data,
	                timeout=self.request_timeout,
	            )

	        response.raise_for_status()
	        return response.json()

	    def check_status(self, uid: str) -> dict:
	        """
	        Check the processing status of an uploaded media file

	        Args:
	            uid: Unique identifier returned from upload

	        Returns:
	            Metadata dictionary with current status

	        Raises:
	            requests.HTTPError: If the API answers with an error status
	        """
	        url = f"{self.base_url}/medias/{uid}/metadata"
	        response = requests.get(
	            url, headers=self.headers, timeout=self.request_timeout
	        )
	        response.raise_for_status()
	        return response.json()

	    def wait_for_completion(
	        self, uid: str, poll_interval: int = 2, timeout: int = 300
	    ) -> dict:
	        """
	        Poll the API until processing is complete

	        The status is always checked at least once, even when ``timeout`` is
	        zero, and one final check is made when the deadline is reached.

	        Args:
	            uid: Unique identifier returned from upload
	            poll_interval: Seconds between status checks
	            timeout: Maximum seconds to wait

	        Returns:
	            Final metadata dictionary

	        Raises:
	            RuntimeError: If the API reports the status ``FAILED``
	            TimeoutError: If processing does not finish within ``timeout``
	        """
	        # A monotonic clock keeps the deadline immune to system clock changes.
	        deadline = time.monotonic() + timeout

	        while True:
	            metadata = self.check_status(uid)
	            status = metadata.get("enhancement_status")

	            print(f"Status: {status}")

	            if status == "COMPLETED":
	                return metadata
	            if status == "FAILED":
	                raise RuntimeError(f"Enhancement failed: {metadata}")

	            remaining = deadline - time.monotonic()
	            if remaining <= 0:
	                break
	            # Never sleep past the deadline.
	            time.sleep(min(poll_interval, remaining))

	        raise TimeoutError(f"Processing did not complete within {timeout} seconds")

	    def download_enhanced(self, uid: str, output_path: str):
	        """
	        Download the enhanced audio file

	        The data is streamed into ``<output_path>.part`` and only moved to
	        ``output_path`` once the download has finished, so an interrupted
	        download never leaves a truncated file at the final location.

	        Args:
	            uid: Unique identifier returned from upload
	            output_path: Path where to save the enhanced file

	        Raises:
	            requests.HTTPError: If the API answers with an error status
	        """
	        url = f"{self.base_url}/medias/{uid}/file"
	        partial_path = f"{output_path}.part"

	        try:
	            # The context manager releases the streamed connection.
	            with requests.get(
	                url,
	                headers=self.headers,
	                stream=True,
	                timeout=self.request_timeout,
	            ) as response:
	                response.raise_for_status()

	                with open(partial_path, "wb") as f:
	                    for chunk in response.iter_content(chunk_size=8192):
	                        f.write(chunk)

	            os.replace(partial_path, output_path)
	        finally:
	            # After a successful replace the partial file is gone; on any
	            # failure this removes the incomplete download.
	            if os.path.exists(partial_path):
	                os.remove(partial_path)

	        print(f"Enhanced audio saved to: {output_path}")


	def get_enhanced_output_path(input_path: Path) -> Path:
	    """Return the sibling path whose stem ends in '_enhanced'.

	    The directory and the file extension of ``input_path`` are kept; only
	    the stem (the file name without its last suffix) is extended.
	    """
	    enhanced_name = f"{input_path.stem}_enhanced{input_path.suffix}"
	    return input_path.with_name(enhanced_name)


	def enhance_audio_files(
	    input_paths: List[Path],
	    api_key: str,
	    enhancement_level: int = 90,
	    enhancement_model: str = "LARK_V2",
	) -> List[Path]:
	    """
	    Process a list of audio files through AI-Coustics API.

	    Inputs that do not exist, or whose '_enhanced' output already exists,
	    are skipped. A failure on one file is printed and does not stop the
	    remaining files from being processed.

	    NOTE(review): the default ``enhancement_level`` here is 90 while
	    ``AiCousticsEnhancer.upload_audio`` and the command line default to
	    100 - confirm this difference is intended.

	    Args:
	        input_paths: List of paths to audio files
	        api_key: Your AI-Coustics API key
	        enhancement_level: Enhancement strength (0-100)
	        enhancement_model: Model to use (LARK_V2 or FINCH_V2)

	    Returns:
	        List of paths to the enhanced audio files written by this call
	    """
	    client = AiCousticsEnhancer(api_key)

	    enhanced_files: List[Path] = []
	    pending = [
	        p for p in input_paths
	        if p.exists() and not get_enhanced_output_path(p).exists()
	    ]
	    # Missing inputs and already-enhanced files are skips, not failures.
	    skipped = len(input_paths) - len(pending)
	    attempted = 0

	    for input_path in tqdm.tqdm(pending):
	        output_path = get_enhanced_output_path(input_path)
	        # The output may have appeared since the filter above ran, e.g.
	        # when the same input is listed twice.
	        if output_path.exists():
	            print(f"Found {output_path}, skipping")
	            skipped += 1
	            continue
	        output_path.parent.mkdir(parents=True, exist_ok=True)
	        attempted += 1

	        # Per-file boundary: report the error and carry on with the batch.
	        try:
	            print("Uploading to AI-Coustics...")
	            result = client.upload_audio(
	                str(input_path),
	                enhancement_level=enhancement_level,
	                enhancement_model=enhancement_model,
	            )

	            uid = result["uid"]
	            print(f"Uploaded successfully. UID: {uid}")

	            print("Waiting for enhancement to complete...")
	            client.wait_for_completion(uid)
	            print("Enhancement completed!")

	            client.download_enhanced(uid, str(output_path))
	            enhanced_files.append(output_path)

	        except Exception as e:
	            print(f"Error processing file {input_path}: {e}")

	    print(f"\n{'=' * 50}")
	    print(
	        f"Processing complete! Enhanced {len(enhanced_files)}/{attempted} files"
	    )
	    if skipped:
	        print(f"Skipped {skipped} files (missing or already enhanced)")
	    print(f"{'=' * 50}")

	    return enhanced_files


	def collect_audio_files(paths: List[str]) -> List[Path]:
	    """Collect audio files from a mix of files and directories.

	    Files are taken as given, directories are searched with
	    ``librosa.util.find_files``, and anything else produces a warning.
	    Files whose stem ends in '_enhanced' are dropped, since that is the
	    naming used for enhanced outputs. Duplicates are removed while keeping
	    the order of first appearance.

	    Args:
	        paths: File or directory paths

	    Returns:
	        List of unique audio file paths that are not enhanced outputs
	    """
	    candidates: List[Path] = []
	    for p in paths:
	        path = Path(p).resolve()
	        if path.is_file():
	            candidates.append(path)
	        elif path.is_dir():
	            candidates.extend(Path(f) for f in librosa.util.find_files(path))
	        else:
	            print(f"Warning: {p} is not a valid file or directory, skipping")

	    # dict.fromkeys de-duplicates while preserving insertion order.
	    unique_files = dict.fromkeys(candidates)
	    # Match only the suffix so that names merely containing '_enhanced'
	    # (e.g. 'my_enhanced_take.wav') are still processed.
	    return [f for f in unique_files if not f.stem.endswith("_enhanced")]


	if __name__ == "__main__":
	    # Command-line entry point: parse arguments, collect the input files,
	    # enhance them and list the files that were written.
	    import argparse

	    parser = argparse.ArgumentParser(
	        description="Enhance audio files using AI-Coustics API"
	    )
	    parser.add_argument(
	        "inputs",
	        nargs="+",
	        help="Input audio files or directories to process",
	    )
	    parser.add_argument(
	        "--api-key",
	        default=os.environ.get("AICOUSTICS_API_KEY"),
	        help="AI-Coustics API key (default: AICOUSTICS_API_KEY env var)",
	    )
	    parser.add_argument(
	        "--enhancement-level",
	        type=int,
	        default=100,
	        help="Enhancement strength 0-100 (default: 100)",
	    )
	    parser.add_argument(
	        "--model",
	        default="LARK_V2",
	        choices=["LARK_V2", "FINCH_V2"],
	        help="Enhancement model (default: LARK_V2)",
	    )
	    args = parser.parse_args()

	    if not args.api_key:
	        parser.error("API key required via --api-key or AICOUSTICS_API_KEY env var")
	    # Reject values outside the documented range before any upload starts.
	    if not 0 <= args.enhancement_level <= 100:
	        parser.error("--enhancement-level must be between 0 and 100")

	    input_files = collect_audio_files(args.inputs)
	    if not input_files:
	        print("No valid audio files found to process.")
	        # The builtin exit() is provided by the site module and is not
	        # guaranteed to exist; SystemExit always is.
	        raise SystemExit(1)

	    print(f"Found {len(input_files)} audio files to process")

	    enhanced_files = enhance_audio_files(
	        input_paths=input_files,
	        api_key=args.api_key,
	        enhancement_level=args.enhancement_level,
	        enhancement_model=args.model,
	    )

	    print("\nEnhanced files:")
	    for enhanced_file in enhanced_files:
	        print(f" - {enhanced_file}")