Spaces:
Running
Running
| import requests | |
| import pandas as pd | |
| import streamlit as st | |
| import numpy as np | |
# Date stamp displayed in the summary table. Currently this is simply "today"
# (the moment the app runs), not the sheet's real modification time.
catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d')
# TODO - extract from the catalog name

# Row labels for the catalog summary table. The order here must stay in sync
# with the "Value" list built in catalog_summary_statistics().
BASE_SUMMARY_METRICS = [
    "Catalog last update date",
    "Unique Polish speech datasets producers",
    "Identified datasets reported in the public domain",
    "Datasets available to the public (free and paid)",
    "Fraction of reported datasets available to the public [%]",
    "Speech data reported in the public domain [hours]",
    "Speech data available total [hours]",
    "Speech data available free of charge [hours]",
    "Speech data available commercially [hours]",
    "Reported vs available speech data ratio [%]",
    "Transcribed speech data reported in the public domain [hours]",
    "Transcribed speech data available total [hours]",
    "Transcribed speech data available free of charge [hours]",
    "Transcribed speech data available commercially [hours]",
    "Reported vs available transcribed speech data ratio [%]",
]
def download_tsv_from_google_sheet(sheet_url):
    """
    Download a Google Sheet as TSV and parse it into a DataFrame.

    Args:
        sheet_url (str): Google Sheets URL of the form
            'https://docs.google.com/spreadsheets/d/<id>/edit#gid=<gid>'.

    Returns:
        pd.DataFrame or None: the parsed sheet, or None when the download
        fails (non-200 status, timeout, or connection error).
    """
    from io import StringIO

    # Rewrite the interactive 'edit' URL into the TSV export endpoint
    # for the same worksheet (gid).
    tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')
    try:
        # A timeout prevents the app from hanging forever on a stalled request.
        response = requests.get(tsv_url, timeout=30)
    except requests.RequestException as exc:
        # Follow the same failure contract as a bad status: report and return None.
        print(f"Failed to download the TSV file: {exc}")
        return None
    # Force UTF-8: the export is UTF-8 but the charset header may be missing.
    response.encoding = 'utf-8'
    if response.status_code == 200:
        df = pd.read_csv(StringIO(response.text), sep='\t', encoding='utf-8')
        return df
    print("Failed to download the TSV file.")
    return None
def load_data_catalog():
    """Fetch the Polish speech datasets catalog sheet as a DataFrame (or None on failure)."""
    print("Reading speech data catalog")
    url = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
    return download_tsv_from_google_sheet(url)
def load_data_taxonomy():
    """Fetch the speech data survey taxonomy sheet as a DataFrame (or None on failure)."""
    print("Reading speech data survey taxonomy")
    url = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
    return download_tsv_from_google_sheet(url)
def load_bench_catalog():
    """Fetch the ASR benchmarks catalog sheet as a DataFrame (or None on failure)."""
    print("Reading ASR benchmarks catalog")
    url = "https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit#gid=0"
    return download_tsv_from_google_sheet(url)
def load_bench_taxonomy():
    """Fetch the ASR benchmarks survey taxonomy sheet as a DataFrame (or None on failure)."""
    print("Reading ASR benchmarks survey taxonomy")
    # NOTE(review): this URL is the speech-data spreadsheet (identical to
    # load_data_taxonomy), not the benchmarks spreadsheet used by
    # load_bench_catalog — confirm the two surveys really share one taxonomy.
    url = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
    return download_tsv_from_google_sheet(url)
def style_floats(val):
    """
    Format a cell value for display.

    Floats with no fractional part render as plain integers, other floats
    with two decimal places, ints as their decimal string, and anything
    else (e.g. strings) passes through unchanged.
    """
    if isinstance(val, float):
        # Whole-valued floats lose the trailing ".0"; the rest get 2 decimals.
        return f"{int(val)}" if val % 1 == 0 else f"{val:.2f}"
    if isinstance(val, int):
        return str(val)
    return val
def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=None, col_count=None):
    """
    Summarize datasets grouped by one or more catalog columns.

    Args:
        df_cat (pd.DataFrame): The base dataframe containing dataset information.
            The caller's frame is NOT modified (a copy is taken internally).
        col_groupby (str or list): The column(s) to group the datasets by.
        col_sort (str or list): Column(s) to sort the final summary by
            (descending). Defaults to ``col_groupby``.
        col_percent (str or list): Column(s) whose row-wise sum is expressed
            as a percentage of the grand total ('Percent of total').
        col_sum (str or list): The column(s) to sum.
            Defaults to ['Size audio transcribed [hours]'].
        col_count (str or list): The column(s) to count.
            Defaults to ['Dataset ID'].

    Returns:
        pd.DataFrame: summary indexed by ``col_groupby`` with the count
        column(s) first, then the sums, then the optional percent column.
        Zero sums are replaced by NaN so styling can blank them out.
    """
    # Defaults are created per call (avoids the shared-mutable-default pitfall).
    if col_sum is None:
        col_sum = ['Size audio transcribed [hours]']
    if col_count is None:
        col_count = ['Dataset ID']
    # Normalize all column arguments to lists.
    if not isinstance(col_sum, list):
        col_sum = [col_sum]
    if not isinstance(col_count, list):
        col_count = [col_count]
    if not isinstance(col_groupby, list):
        col_groupby = [col_groupby]
    if col_percent is not None and not isinstance(col_percent, list):
        # Accept a single column name too (the docstring always allowed str).
        col_percent = [col_percent]

    # Work on a copy so the numeric coercion below never mutates the caller's frame.
    df_cat = df_cat.copy()
    for col in col_sum:
        df_cat[col] = pd.to_numeric(df_cat[col], errors='coerce').fillna(0)

    # Aggregate: sum the size-like columns, count the id-like columns.
    summary = df_cat.groupby(col_groupby).agg({
        **{col: 'sum' for col in col_sum},
        **{col: 'count' for col in col_count}
    }).reset_index()

    col_name_percent = 'Percent of total'
    if col_percent is not None:
        # Row-wise total of the percent columns, as a share of the grand total.
        total = summary[col_percent].sum(axis=1)
        summary[col_name_percent] = round(total / total.sum() * 100, 2)

    # Order rows by the first summed column, then index by the group key(s).
    summary.sort_values(by=col_sum[0], ascending=False, inplace=True)
    summary.reset_index(drop=True, inplace=True)
    summary.set_index(col_groupby, inplace=True)

    # Rename a single count column to a descriptive name; with several count
    # columns keep the original names (previously this path raised NameError).
    if len(col_count) == 1:
        col_name_count = 'Count ' + col_count[0]
        summary.rename(columns={col_count[0]: col_name_count}, inplace=True)
        summary[col_name_count] = summary[col_name_count].astype(int)
        count_cols = [col_name_count]
    else:
        # TODO - add descriptive renaming for multiple count columns
        count_cols = list(col_count)

    # Final column order: counts, sums, then the optional percent column.
    ordered_cols = count_cols + col_sum
    if col_percent is not None:
        ordered_cols = ordered_cols + [col_name_percent]
    summary = summary[ordered_cols]

    # Sort by the requested column(s), falling back to the group key(s).
    col_sort = col_groupby if col_sort is None else col_sort
    summary.sort_values(by=col_sort, ascending=False, inplace=True)

    # Blank out zero sums so styled tables show them as empty.
    for col in col_sum:
        summary[col] = summary[col].replace(0, np.nan)
    return summary
def datasets_count_and_size_standard(df_cat, col_groupby):
    """Group the catalog by ``col_groupby`` using the standard metric columns
    (transcribed hours, recordings, speakers, dataset count, percent of total)."""
    size_col = 'Size audio transcribed [hours]'
    return datasets_count_and_size(
        df_cat,
        col_groupby,
        col_sort=col_groupby,
        col_percent=[size_col],
        col_sum=[size_col, 'Audio recordings', 'Speakers'],
        col_count=['Dataset ID'],
    )
def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid):
    """
    Measure how many datasets carry each metadata annotation ('yes' values),
    split into free vs paid subsets.

    Args:
        df_cat (pd.DataFrame): full catalog (one row per dataset).
        df_cat_available_free (pd.DataFrame): subset available online for free.
        df_cat_available_paid (pd.DataFrame): subset available online, paid.

    Returns:
        tuple(pd.DataFrame, pd.DataFrame):
            - flat frame: one row per (metadata column, Type in {Free, Paid});
            - pivoted frame: metadata column as index, Free/Paid side by side
              for count, transcribed hours and percent-of-total.
    """
    #TODO - add number of speakers and recordings
    # 'Speaker id info', 'Part of speech annotation', 'Named entity annotation', 'Emotion annotation'
    meta_data_cols = ['Gender info', 'Age info', 'Accent info', 'Nativity info', 'Time alignement annotation']
    meta_coverage_all_sets = {}
    meta_coverage_free_sets = {}
    meta_coverage_paid_sets = {}
    # Column names produced by datasets_count_and_size_standard().
    col_name_sum_size = 'Size audio transcribed [hours]'
    col_name_count = 'Count Dataset ID'
    col_name_percent = 'Percent of total'
    #, 'Named entity annotation', 'Emotion annotation']
    for meta_data_col in meta_data_cols:
        # Paid subset: take the 'yes' row of the per-column summary, or zeros
        # when no paid dataset carries this annotation.
        df_datasets_per_meta_paid = datasets_count_and_size_standard(df_cat_available_paid, meta_data_col)
        #print(df_datasets_per_meta_paid)
        if 'yes' in df_datasets_per_meta_paid.index:
            meta_coverage_paid_sets[meta_data_col] = df_datasets_per_meta_paid.loc['yes']
        else:
            meta_coverage_paid_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0}
        # NOTE(review): the all-datasets coverage below is computed but never
        # used in the returned frames — confirm whether it can be dropped or
        # should be part of the output.
        df_datasets_per_meta_all = datasets_count_and_size_standard(df_cat, meta_data_col)
        #print(df_datasets_per_meta_all)
        # select row where index has value "yes" and column name is "Percent of total"
        if 'yes' in df_datasets_per_meta_all.index:
            meta_coverage_all_sets[meta_data_col] = df_datasets_per_meta_all.loc['yes']
        else:
            meta_coverage_all_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0}
        # Free subset, same 'yes'-row extraction as above.
        df_datasets_per_meta_free = datasets_count_and_size_standard(df_cat_available_free, meta_data_col)
        #print(df_datasets_per_meta_free)
        # check if index has value "yes", if not assign 0
        if 'yes' in df_datasets_per_meta_free.index:
            meta_coverage_free_sets[meta_data_col] = df_datasets_per_meta_free.loc['yes']
        else:
            meta_coverage_free_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0}
    #merge all free and paid dataframes
    df_meta_free = pd.DataFrame.from_dict(meta_coverage_free_sets, orient='index')
    df_meta_free[col_name_count] = df_meta_free[col_name_count].astype(int)
    df_meta_paid = pd.DataFrame.from_dict(meta_coverage_paid_sets, orient='index')
    df_meta_paid[col_name_count] = df_meta_paid[col_name_count].astype(int)
    # Tag each half so the flat frame distinguishes free vs paid rows.
    df_meta_free['Type'] = 'Free'
    df_meta_paid['Type'] = 'Paid'
    df_meta_all_flat = pd.concat([df_meta_free, df_meta_paid])
    #transform to compare free and paid column by column
    df_meta_all_pivot = df_meta_all_flat.reset_index()
    df_meta_all_pivot = df_meta_all_pivot.rename(columns={'index':'Metadata'})
    df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent])
    df_meta_all_pivot[col_name_count]=df_meta_all_pivot[col_name_count].astype(int)
    #df_meta_all_pivot_styled = df_meta_all_pivot.style.map(style_floats)
    #df_meta_all_flat_styled = df_meta_all_flat.style.map(style_floats)
    return(df_meta_all_flat, df_meta_all_pivot)
def catalog_summary_statistics(df_cat):
    """
    Generate summary statistics for the speech data catalog.

    Args:
        df_cat (pd.DataFrame): catalog with one row per dataset; must contain
            the size, 'Available online', 'Price - non-commercial usage' and
            'Publisher' columns used below. The caller's frame is not modified.

    Returns:
        pd.DataFrame: single "Value" column indexed by metric name ("Metric"),
        ordered to match BASE_SUMMARY_METRICS.
    """
    col_name_transcribed = 'Size audio transcribed [hours]'
    col_name_audio = 'Size audio total [hours]'

    def _pct(numerator, denominator):
        # Percentage helper guarded against empty catalogs / zero totals
        # (previously a ZeroDivisionError).
        return round((numerator / denominator) * 100, 2) if denominator else 0.0

    # Work on a copy so the numeric coercion does not mutate the caller's frame.
    df_cat = df_cat.copy()
    df_cat[col_name_audio] = pd.to_numeric(df_cat[col_name_audio], errors='coerce')
    df_cat[col_name_transcribed] = pd.to_numeric(df_cat[col_name_transcribed], errors='coerce')

    # Segment by availability and price. NaN prices count as non-free,
    # matching the original `!= 'free'` comparison.
    is_available = df_cat['Available online'] == 'yes'
    is_free = df_cat['Price - non-commercial usage'] == 'free'
    df_cat_available = df_cat[is_available]
    df_cat_available_free = df_cat[is_available & is_free]
    df_cat_available_paid = df_cat[is_available & ~is_free]

    # Basic counts
    identified_datasets_count = df_cat.shape[0]
    accessible_datasets_count = df_cat_available.shape[0]
    unique_producers_count = df_cat['Publisher'].nunique()
    accessible_datasets_fraction = _pct(accessible_datasets_count, identified_datasets_count)

    # Hours of audio (total and transcribed) per segment
    audio_reported = round(df_cat[col_name_audio].sum(), 2)
    audio_accessible = round(df_cat_available[col_name_audio].sum(), 2)
    audio_accessible_free = round(df_cat_available_free[col_name_audio].sum(), 2)
    audio_accessible_paid = round(df_cat_available_paid[col_name_audio].sum(), 2)
    transcribed_audio_reported = round(df_cat[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible = round(df_cat_available[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_free = round(df_cat_available_free[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_paid = round(df_cat_available_paid[col_name_transcribed].sum(), 2)

    # Available vs reported speech material ratios
    accessible_vs_reported_audio_ratio = _pct(audio_accessible, audio_reported)
    accessible_vs_reported_transcribed_ratio = _pct(transcribed_audio_accessible, transcribed_audio_reported)

    # Values in the same order as BASE_SUMMARY_METRICS.
    metrics_dict = {
        "Metric": BASE_SUMMARY_METRICS,
        "Value": [
            catalog_last_update_date,
            unique_producers_count,
            identified_datasets_count,
            accessible_datasets_count,
            accessible_datasets_fraction,
            audio_reported,
            audio_accessible,
            audio_accessible_free,
            audio_accessible_paid,
            accessible_vs_reported_audio_ratio,
            transcribed_audio_reported,
            transcribed_audio_accessible,
            transcribed_audio_accessible_free,
            transcribed_audio_accessible_paid,
            accessible_vs_reported_transcribed_ratio,
        ]
    }
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df.set_index("Metric", inplace=True)
    return metrics_df
def right_align(s, props='text-align: right;'):
    """Styler callback: return CSS right-aligning a cell (cell value `s` is ignored)."""
    return props
def left_align(s, props='text-align: left;'):
    """Styler callback: return CSS left-aligning a cell (cell value `s` is ignored)."""
    return props