Spaces:
Sleeping
Sleeping
| """Utils to load CSV file of audio datasets.""" | |
| import os | |
| import pandas as pd | |
| import shared.utils as su | |
def configure_paths_sound_of_water(
    data_root="/work/piyush/from_nfs2/datasets/SoundOfWater",
):
    """Build the directory layout of the SoundOfWater dataset.

    Args:
        data_root: Root folder of the dataset.

    Returns:
        dict with the keys "data_dir", "video_clip_dir", "audio_clip_dir",
        "annot_dir" and "split_dir". "data_dir" is ``data_root`` itself,
        every other entry is a sub-folder of it.
    """
    # Audio clips resolve to the same "videos" folder as the video clips.
    subfolders = {
        "video_clip_dir": "videos",
        "audio_clip_dir": "videos",
        "annot_dir": "annotations",
        "split_dir": "splits",
    }
    paths = {"data_dir": data_root}
    for key, folder in subfolders.items():
        paths[key] = os.path.join(data_root, folder)
    return paths
def _keep_rows_with_existing_files(df, column):
    """Return the rows of ``df`` whose path stored in ``column`` exists."""
    # Cast to bool explicitly: on an empty frame `.apply` yields an
    # object-dtype Series, which `df[...]` would interpret as a list of
    # column labels (dropping every column) instead of a row mask.
    mask = df[column].apply(os.path.exists).astype(bool)
    return df[mask]


def load_csv_sound_of_water(
    paths: dict,
    csv_filters=None,
    csv_name="localisation.csv",
    ds_name="SoundOfWater",
    split=None,
    check_first_frame_annots=True,
):
    """Loads CSV containing metadata of the dataset.

    The CSV is read from ``paths["annot_dir"]``, every row is extended with
    the entry of ``containers.yaml`` selected by its ``container_id``, and
    the columns ``item_id``, ``video_clip_path`` and ``audio_clip_path``
    (plus ``box_path`` / ``mask_path`` when requested) are added. Rows whose
    files are missing on disk are dropped.

    Args:
        paths: Needs the keys "video_clip_dir", "audio_clip_dir" and
            "annot_dir"; "split_dir" is also required when ``split`` is used.
        csv_filters: Optional filters forwarded to
            ``su.pd_utils.apply_filters`` (presumably column -> allowed
            values; confirm against that helper). The passed object is
            never modified.
        csv_name: File name of the CSV inside ``paths["annot_dir"]``.
        ds_name: Dataset name, only used in the log message.
        split: File name inside ``paths["split_dir"]`` listing the item ids
            to keep. Ignored when ``csv_filters`` already has an "item_id"
            entry.
        check_first_frame_annots: If True, add ``box_path`` / ``mask_path``
            and keep only rows for which both files exist.

    Returns:
        pandas.DataFrame with the remaining rows.

    Raises:
        AssertionError: If the CSV, ``containers.yaml`` or the split file
            cannot be found.
    """
    # Work on a private copy. The original used a shared `dict()` default
    # and wrote the split's item ids into it, so a later call with another
    # split silently reused the ids of the first one.
    csv_filters = dict(csv_filters) if csv_filters else {}

    su.log.print_update(
        f" [:::] Loading {ds_name}.",
        pos="left",
        fillchar=".",
    )

    # Configure paths
    video_clip_dir = paths["video_clip_dir"]
    audio_clip_dir = paths["audio_clip_dir"]

    # Load main CSV
    path = os.path.join(
        paths["annot_dir"], csv_name,
    )
    assert os.path.exists(path), \
        f"CSV file not found at {path}."
    print(" [:::] CSV path:", path)
    df = pd.read_csv(path)

    # Load side information: containers
    container_path = os.path.join(
        paths['annot_dir'], "containers.yaml",
    )
    assert os.path.exists(container_path), \
        f"Container file not found at {container_path}."
    containers = su.io.load_yml(container_path)

    # Update CSV with container information (optional)
    update_with_container_info = True
    if update_with_container_info:
        # Container fields override CSV columns of the same name.
        rows = [
            {**row.to_dict(), **containers[row["container_id"]]}
            for _, row in df.iterrows()
        ]
        df = pd.DataFrame(rows)
    print(" [:::] Shape of CSV: ", df.shape)

    # 1. Update item_id
    df["item_id"] = df.apply(
        lambda d: f"{d['video_id']}_{d['start_time']:.1f}_{d['end_time']:.1f}",
        axis=1,
    )

    # 2. Update video_clip_path
    df["video_clip_path"] = df["item_id"].apply(
        lambda d: os.path.join(
            video_clip_dir, f"{d}.mp4"
        )
    )
    df = _keep_rows_with_existing_files(df, "video_clip_path")
    print(" [:::] Shape of CSV with available video: ", df.shape)

    # 3. Update audio_clip_path (audio is looked up as an .mp4 file too)
    df["audio_clip_path"] = df["item_id"].apply(
        lambda d: os.path.join(
            audio_clip_dir, f"{d}.mp4"
        )
    )
    df = _keep_rows_with_existing_files(df, "audio_clip_path")
    print(" [:::] Shape of CSV with available audio: ", df.shape)

    # Add first frame annotation paths (one box/mask file per video_id)
    if check_first_frame_annots:
        frame_annot_dir = os.path.join(paths["annot_dir"], "container_bboxes")
        df["box_path"] = df["video_id"].apply(
            lambda d: os.path.join(frame_annot_dir, f"{d}_box.npy"),
        )
        df["mask_path"] = df["video_id"].apply(
            lambda d: os.path.join(frame_annot_dir, f"{d}_mask.npy"),
        )
        df = _keep_rows_with_existing_files(df, "box_path")
        df = _keep_rows_with_existing_files(df, "mask_path")
        print(" [:::] Shape of CSV with first frame annotations: ", df.shape)

    # Add split filter; an explicit "item_id" filter takes precedence
    if split is not None and ("item_id" not in csv_filters):
        assert "split_dir" in paths, \
            "paths must contain 'split_dir' when split is given."
        split_path = os.path.join(paths["split_dir"], f"{split}")
        assert os.path.exists(split_path), \
            f"Split file not found at {split_path}."
        item_ids = su.io.load_txt(split_path)
        print(" [:::] Number of item_ids in split:", len(item_ids))
        csv_filters["item_id"] = item_ids

    # Apply filter to the CSV
    if len(csv_filters) > 0:
        df = su.pd_utils.apply_filters(df, csv_filters)
    print(" [:::] Shape of CSV after filtering: ", df.shape)

    return df
if __name__ == "__main__":
    # Manual smoke test: load the dataset with the default paths and
    # print the first row of the resulting table.
    df = load_csv_sound_of_water(paths=configure_paths_sound_of_water())
    first_row = df.iloc[0].to_dict()
    su.log.json_print(first_row)