Spaces:
Runtime error
Runtime error
Salif SAWADOGO
commited on
Commit
·
8086e8b
1
Parent(s):
4b8802f
first commit
Browse files- .gitignore +8 -0
- Dockerfile +15 -0
- app/__init__.py +0 -0
- app/app.py +16 -0
- app/assets/assets/segment_6.mp3 +0 -0
- app/assets/assets/segment_7.mp3 +0 -0
- app/assets/assets/segment_8.mp3 +0 -0
- app/assets/assets/segment_9.mp3 +0 -0
- app/assets/flavicon.png +0 -0
- app/assets/logo.png +0 -0
- app/index.py +77 -0
- app/pages/Annotations/__init__.py +6 -0
- app/pages/Annotations/callbacks.py +0 -0
- app/pages/Annotations/layout.py +152 -0
- app/pages/Contributeurs/__init__.py +6 -0
- app/pages/Contributeurs/layout.py +12 -0
- app/pages/Home/__init__.py +6 -0
- app/pages/Home/layout.py +90 -0
- app/pages/__init__.py +3 -0
- app/utils.py +63 -0
- requirements.txt +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv
|
| 2 |
+
**__pycache__**
|
| 3 |
+
**.pyc**
|
| 4 |
+
**/env/**
|
| 5 |
+
**/.venv/**
|
| 6 |
+
**/*.egg-info
|
| 7 |
+
**/*parquet
|
| 8 |
+
build/
|
Dockerfile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
FROM python:3.11
|
| 3 |
+
|
| 4 |
+
RUN useradd -m -u 1000 user
|
| 5 |
+
USER user
|
| 6 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
| 7 |
+
|
| 8 |
+
WORKDIR /app
|
| 9 |
+
|
| 10 |
+
COPY --chown=user ./requirements.txt requirements.txt
|
| 11 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 12 |
+
|
| 13 |
+
COPY --chown=user . /app
|
| 14 |
+
|
| 15 |
+
# NOTE(review): uvicorn is an ASGI server, but `app.app:app` is the dash.Dash
# object created in app/app.py, and the layout/callbacks are only attached when
# app/index.py is imported (which exposes the Flask server as `application`).
# Confirm the intended server and entry point before relying on this command.
CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app/__init__.py
ADDED
|
File without changes
|
app/app.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import dash
|
| 3 |
+
import dash_bootstrap_components as dbc
|
| 4 |
+
|
| 5 |
+
route_preffix = os.environ.get("EXTRAPATH", "") + "/"
|
| 6 |
+
app_name = "MooreFRCollection"
|
| 7 |
+
|
| 8 |
+
app = dash.Dash(
|
| 9 |
+
__name__,
|
| 10 |
+
title=app_name,
|
| 11 |
+
suppress_callback_exceptions=True,
|
| 12 |
+
external_stylesheets=[dbc.themes.BOOTSTRAP],
|
| 13 |
+
routes_pathname_prefix=route_preffix,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
app._favicon = "logo.png" # change the Dash favicon to the project logo
|
app/assets/assets/segment_6.mp3
ADDED
|
Binary file (10.4 kB). View file
|
|
|
app/assets/assets/segment_7.mp3
ADDED
|
Binary file (8.3 kB). View file
|
|
|
app/assets/assets/segment_8.mp3
ADDED
|
Binary file (14.1 kB). View file
|
|
|
app/assets/assets/segment_9.mp3
ADDED
|
Binary file (14.4 kB). View file
|
|
|
app/assets/flavicon.png
ADDED
|
|
app/assets/logo.png
ADDED
|
app/index.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
from dash.dependencies import Input, Output
|
| 3 |
+
from dash import dcc, html
|
| 4 |
+
import dash_bootstrap_components as dbc
|
| 5 |
+
from app import app, app_name
|
| 6 |
+
from pages import page_list, Home as home
|
| 7 |
+
|
| 8 |
+
# Define styles as constants
|
| 9 |
+
CONTENT_STYLE = {
|
| 10 |
+
"margin-left": "2rem",
|
| 11 |
+
"margin-right": "2rem",
|
| 12 |
+
"padding": "2rem 1rem",
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
TOPBAR_STYLE = {
|
| 16 |
+
"padding": "1rem 1rem",
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
# Load the logo and embed it as a base64 data URI so it can be used inline.
# NOTE(review): the path is relative to the current working directory, so the
# module must be started from the directory that contains `assets/`.
with open("assets/logo.png", "rb") as image_file:
    encoded_logo = base64.b64encode(image_file.read()).decode()

# The file is a PNG, so advertise the matching MIME type in the data URI.
logo = html.Img(src=f"data:image/png;base64,{encoded_logo}", height="40px")

# Branding component: logo followed by the application name.
# The label comes from `app_name` so it cannot drift from the page title.
branding = dbc.Row(
    [dbc.Col(logo), dbc.Col(dbc.NavbarBrand(app_name, className="ms-2"))],
    align="center",
    className="g-0",
)
|
| 31 |
+
|
| 32 |
+
# Build one navigation link per registered page.
nav_links = [
    dbc.NavLink(page.name, href=page.path, id=f"navlink_{page.id_}", active="exact")
    for page in page_list
]

# Top navigation bar: clickable branding first, then the page links.
navbar = dbc.Navbar(
    [
        html.A(branding, href=home.path, id=home.id_, style={"textDecoration": "none"}),
        dbc.Nav(
            nav_links,
            navbar=True,
            # The rest of this file uses Bootstrap 5 utilities ("ms-2", "g-0");
            # in Bootstrap 5 the auto start-margin class is "ms-auto", the
            # former "ml-auto" no longer exists.
            className="ms-auto",
            style={"font-size": "18px"},
            pills=True,
        ),
    ],
    color="#000080",
    dark=True,
    sticky="top",
    style=TOPBAR_STYLE,
)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# Swap the page body whenever the browser URL changes.
@app.callback(Output("page-content", "children"), [Input("url", "pathname")])
def display_page(pathname):
    """Return the layout of the first page whose ``path`` equals ``pathname``.

    When no entry of ``page_list`` matches, the Home layout is returned.
    """
    for candidate in page_list:
        if pathname == candidate.path:
            return candidate.layout
    # Unknown or root URL: fall back to the home page.
    return home.layout
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# App layout definition
|
| 65 |
+
app.layout = html.Div(
|
| 66 |
+
[dcc.Location(id="url"), navbar, html.Div(id="page-content", style=CONTENT_STYLE)],
|
| 67 |
+
style={
|
| 68 |
+
"backgroundColor": "#f8f9fa",
|
| 69 |
+
},
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
# Define the WSGI server application
|
| 73 |
+
application = app.server
|
| 74 |
+
|
| 75 |
+
# Entry point for running the app
|
| 76 |
+
if __name__ == "__main__":
|
| 77 |
+
application.run(debug=True, port=8080, host="0.0.0.0")
|
app/pages/Annotations/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .layout import layout
|
| 2 |
+
from app import route_preffix
|
| 3 |
+
|
| 4 |
+
id_ = __name__.split(".")[1]
|
| 5 |
+
name = id_.replace("_", " ").capitalize()
|
| 6 |
+
path = route_preffix + id_
|
app/pages/Annotations/callbacks.py
ADDED
|
File without changes
|
app/pages/Annotations/layout.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import dash
|
| 3 |
+
import dash_bootstrap_components as dbc
|
| 4 |
+
from dash import dcc, html, Input, Output, callback, State
|
| 5 |
+
import datetime
|
| 6 |
+
from utils import get_audio_paths, load_data, load_persistent_data, save_persistent_data
|
| 7 |
+
from app import app
|
| 8 |
+
|
| 9 |
+
# Module-level configuration for the annotation page.
# Folder scanned for audio segments (relative to the working directory).
AUDIO_FOLDER = Path("./assets/assets")
# JSON file in which submitted annotations are accumulated.
PERSIST_FILE = "results.json"
# NOTE(review): this is an absolute path on a Windows workstation; it is not
# part of the repository, so load_data() below will presumably fail anywhere
# else (e.g. inside the Docker image). TODO: make this configurable.
DATA_FILE = "C:/Users/sawal/OneDrive/Bureau/Projets X/datasets/datasets/bible_data_moore.parquet/page=S%C9%A9ngre%2F1"

# Initial load of the audio queue and of the transcription suggestions.
# (Debug prints were removed: indexing audio_paths[0] raised IndexError at
# import time whenever no audio file was found.)
audio_paths = get_audio_paths(AUDIO_FOLDER)

possible_values = load_data(DATA_FILE)
|
| 22 |
+
|
| 23 |
+
def create_layout(audio_paths, possible_values):
    """Build and return the annotation page container.

    The page has a title row, a row asking for the contributor's identity with
    a start button, and a transcription row (initially hidden via
    ``display: none``) containing the audio player, the suggestion checklist,
    the submit button and a confirmation message area.

    Args:
        audio_paths: Sequence of audio sources; the first one (or ``""`` when
            empty) is loaded into the player.
        possible_values: Sequence of suggestion strings; only the first six
            are offered in the checklist.
    """
    first_audio = audio_paths[0] if audio_paths else ""
    suggestion_options = [{"label": t, "value": t} for t in possible_values[:6]]

    title_row = dbc.Row([
        dbc.Col(html.H1("Outil de transcription audio", className="text-center my-4 text-primary"), width=12)
    ])

    identity_row = dbc.Row([
        dbc.Col([
            dbc.Input(
                id="user-info",
                placeholder="Entrez votre email, pseudonyme ou nom pour qu'on vous crédite",
                type="text",
                className="mb-3"
            ),
            dbc.Button("Démarrer", id="start-button", color="primary", className="w-100")
        ], width=12)
    ])

    player_card = dbc.Card([
        dbc.CardHeader("Lecture audio"),
        dbc.CardBody([
            html.Audio(
                id="audio-player",
                src=first_audio,
                controls=True,
                autoPlay=False,
                className="w-100"
            )
        ])
    ], className="mb-4 shadow")

    suggestions_card = dbc.Card([
        dbc.CardHeader("Suggestions de transcriptions"),
        dbc.CardBody([
            dcc.Checklist(
                id="suggestion-checklist",
                options=suggestion_options,
                value=[],
                style={"columns": "3", "column-gap": "1rem"}
            )
        ])
    ], className="mb-4 shadow")

    submit_button = dbc.Button(
        "Soumettre",
        id="submit-button",
        n_clicks=0,
        color="secondary",
        className="w-100",
        style={"marginTop": "20px"}
    )

    confirmation_area = html.Div(id="confirmation-message", className="text-success text-center mt-3")

    # Hidden until the start callback switches its style to display: block.
    transcription_row = dbc.Row(
        id="transcription-section",
        style={'display': 'none'},
        children=[
            dbc.Col([player_card], width=12),
            dbc.Col([suggestions_card], width=12),
            dbc.Col([submit_button], width=12),
            dbc.Col([confirmation_area], width=12)
        ]
    )

    return dbc.Container([title_row, identity_row, transcription_row])
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
layout = create_layout(audio_paths, possible_values)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@app.callback(
    Output("transcription-section", "style"),
    Input("start-button", "n_clicks"),
    State("user-info", "value")
)
def start_transcription(n_clicks, user_info):
    """Reveal the transcription section once the user has started.

    The section is shown only when the start button has been clicked at least
    once and the identity field is non-empty; otherwise it stays hidden.
    """
    hidden = {'display': 'none'}
    # Button never clicked yet.
    if n_clicks is None or n_clicks <= 0:
        return hidden
    # Clicked, but no identity was provided.
    if not user_info:
        return hidden
    return {'display': 'block'}
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
@app.callback(
    [Output("audio-player", "src"),
     Output("suggestion-checklist", "options"),
     Output("confirmation-message", "children")],
    [Input("submit-button", "n_clicks")],
    [State("suggestion-checklist", "value"),
     State("user-info", "value"),
     State("audio-player", "src")]
)
def update_transcription(n_clicks, selected_transcriptions, user_info, current_audio):
    """Store the submitted selection and move on to the next audio segment.

    Args:
        n_clicks: Click count of the submit button.
        selected_transcriptions: Values ticked in the suggestion checklist.
        user_info: Identity typed by the contributor; stored as ``user_id``.
        current_audio: ``src`` of the segment that was just annotated.

    Returns:
        A ``(next_audio_src, next_options, message)`` tuple. When there is
        nothing to process, the first two items are ``dash.no_update`` and the
        message says that all files were handled only if the queue is empty.
    """
    # NOTE(review): both lists are module-level and mutated in place, so the
    # queue and the suggestion pool are shared by every session served by this
    # process. Per-user state would need something like a dcc.Store.
    global audio_paths, possible_values

    # Nothing submitted yet (0 or None clicks) or nothing left to annotate.
    if not n_clicks or not audio_paths:
        done_message = "Tous les fichiers ont été traités !" if not audio_paths else dash.no_update
        return dash.no_update, dash.no_update, done_message

    # The checklist value may be None; treat that as "nothing selected".
    selected = list(selected_transcriptions or [])

    # Record the annotation, crediting the contributor who submitted it.
    # The old placeholder is kept only as a fallback for an empty identity.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = {
        "segment_path": current_audio,
        "transcriptions": selected,
        "timestamp": timestamp,
        "user_id": user_info or "current_user"
    }
    persistent_data = load_persistent_data(PERSIST_FILE)
    persistent_data.append(entry)
    save_persistent_data(persistent_data, PERSIST_FILE)

    # Advance the queue to the next audio file ("" when it is exhausted).
    audio_paths.pop(0)
    next_audio = audio_paths[0] if audio_paths else ""

    # Suggestions that were used are not offered again.
    for value in selected:
        if value in possible_values:
            possible_values.remove(value)
    next_options = [{"label": t, "value": t} for t in possible_values[:6]]

    # Confirmation message shown under the submit button.
    confirmation_message = (
        f"Transcriptions sélectionnées : {', '.join(selected)}"
        if selected else "Aucune transcription sélectionnée."
    )

    return next_audio, next_options, confirmation_message
|
| 152 |
+
|
app/pages/Contributeurs/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .layout import layout
|
| 2 |
+
from app import route_preffix
|
| 3 |
+
|
| 4 |
+
id_ = __name__.split(".")[1]
|
| 5 |
+
name = id_.replace("_", " ").capitalize()
|
| 6 |
+
path = route_preffix + id_
|
app/pages/Contributeurs/layout.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dash import dcc, html, Input, Output, callback
|
| 2 |
+
import dash_bootstrap_components as dbc
|
| 3 |
+
|
| 4 |
+
layout = dbc.Container(
|
| 5 |
+
[
|
| 6 |
+
dcc.Markdown(
|
| 7 |
+
f"""
|
| 8 |
+
Ceci est un example
|
| 9 |
+
"""
|
| 10 |
+
)
|
| 11 |
+
]
|
| 12 |
+
)
|
app/pages/Home/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .layout import layout
|
| 2 |
+
from app import route_preffix
|
| 3 |
+
|
| 4 |
+
id_ = __name__.split(".")[1]
|
| 5 |
+
name = id_.replace("_", " ").capitalize()
|
| 6 |
+
path = route_preffix + id_
|
app/pages/Home/layout.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dash import dcc, html, Input, Output, callback
|
| 2 |
+
import dash_bootstrap_components as dbc
|
| 3 |
+
|
| 4 |
+
layout = dbc.Container(
|
| 5 |
+
[
|
| 6 |
+
dcc.Markdown(
|
| 7 |
+
f"""
|
| 8 |
+
|
| 9 |
+
# MooreFRCollection
|
| 10 |
+
|
| 11 |
+
**MooreFRCollection** est un projet collaboratif et ouvert visant à construire un corpus bilingue **Mooré-Français** pour soutenir la recherche et le développement de technologies linguistiques adaptées au contexte burkinabé.
|
| 12 |
+
|
| 13 |
+
L'objectif principal est de fournir des données essentielles pour entraîner, tester et affiner des modèles de traduction et d'autres outils d'intelligence artificielle (IA) en lien avec la langue **Mooré**, une des langues nationales du Burkina Faso.
|
| 14 |
+
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
## **Construction de la version audio**
|
| 18 |
+
|
| 19 |
+
Actuellement, nous nous concentrons sur la **partie audio** du projet. Cette phase implique la collecte, le traitement et l'alignement des segments audio et textuels pour produire un corpus riche et diversifié.
|
| 20 |
+
|
| 21 |
+
Les données audio sont collectées à partir de :
|
| 22 |
+
|
| 23 |
+
1. **Textes bibliques de JW.ORG** :
|
| 24 |
+
Les contenus ont été extraits efficacement grâce à l'outil [jwsoup](https://pypi.org/project/jwsoup/). Les segments textuels et leurs correspondances audio sont alignés pour fournir une ressource précieuse pour le traitement automatique des langues.
|
| 25 |
+
|
| 26 |
+
2. **Alignement et segmentation audio** :
|
| 27 |
+
Les fichiers audio sont traités et segmentés en unités exploitables, accompagnées de transcriptions bilingues (Mooré-Français). Chaque segment est soigneusement vérifié pour garantir sa qualité et son utilité dans des applications variées.
|
| 28 |
+
|
| 29 |
+
---
|
| 30 |
+
|
| 31 |
+
## **Applications**
|
| 32 |
+
|
| 33 |
+
Le corpus **MooreFRCollection** ouvre la voie à une multitude d'applications :
|
| 34 |
+
|
| 35 |
+
1. 🔤 **Traduction automatique**
|
| 36 |
+
Développement et évaluation de systèmes de traduction automatiques, permettant la conversion fluide du Mooré vers le Français et inversement.
|
| 37 |
+
|
| 38 |
+
2. 🧠 **Recherche linguistique**
|
| 39 |
+
Analyse approfondie des structures syntaxiques, lexicales et phonétiques spécifiques au Mooré, pour enrichir les études linguistiques.
|
| 40 |
+
|
| 41 |
+
3. 📊 **Apprentissage supervisé**
|
| 42 |
+
Entraînement de modèles d'apprentissage machine sur des tâches variées, comme la reconnaissance vocale ou l'analyse sentimentale en Mooré.
|
| 43 |
+
|
| 44 |
+
4. 📚 **Applications éducatives**
|
| 45 |
+
Soutien aux enseignants, étudiants, et locuteurs natifs pour explorer les interactions linguistiques entre le Mooré et le Français.
|
| 46 |
+
|
| 47 |
+
---
|
| 48 |
+
|
| 49 |
+
## **Appel à contributions**
|
| 50 |
+
|
| 51 |
+
Nous invitons toute personne intéressée à contribuer au développement de **MooreFRCollections**. Votre aide peut prendre plusieurs formes :
|
| 52 |
+
|
| 53 |
+
- **Partager des données** : Soumettez des textes ou fichiers audio en Mooré, ou des données bilingues Mooré-Français.
|
| 54 |
+
- **Participer à l’annotation** : Aidez-nous à valider et enrichir les transcriptions et traductions existantes.
|
| 55 |
+
- **Suggérer des idées** : Proposez de nouvelles approches pour la collecte, le traitement ou les usages potentiels du corpus.
|
| 56 |
+
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
### **Contribuer via HuggingFace**
|
| 60 |
+
|
| 61 |
+
Pour apporter vos contributions directement au dépôt du projet sur HuggingFace, suivez ces étapes simples :
|
| 62 |
+
|
| 63 |
+
1. **Configurer votre environnement Git** :
|
| 64 |
+
- Lisez le [guide sur la dépréciation des mots de passe Git](https://huggingface.co/blog/password-git-deprecation).
|
| 65 |
+
- Configurez [Git SSH](https://huggingface.co/docs/hub/security-git-ssh) pour des connexions sécurisées.
|
| 66 |
+
|
| 67 |
+
2. **Changer l'URL distante** pour utiliser le dépôt HuggingFace :
|
| 68 |
+
```bash
|
| 69 |
+
git remote set-url origin git@hf.co:datasets/sawadogosalif/MooreFRCollections/
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
3. **Créer une nouvelle branche** pour vos contributions :
|
| 73 |
+
```bash
|
| 74 |
+
git checkout -b nom-de-votre-branche
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
4. **Soumettre vos modifications** :
|
| 78 |
+
Faites un push de vos changements et créez une demande de fusion pour qu'elle soit examinée.
|
| 79 |
+
|
| 80 |
+
Nous vous encourageons également à demander l'accès à la branche de développement pour collaborer plus directement sur les améliorations.
|
| 81 |
+
|
| 82 |
+
---
|
| 83 |
+
|
| 84 |
+
Merci de faire partie de cette initiative visant à promouvoir les langues locales par la technologie ! **Votre participation fait la différence.**
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
"""
|
| 88 |
+
)
|
| 89 |
+
]
|
| 90 |
+
)
|
app/pages/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from . import Home, Annotations, Contributeurs
|
| 2 |
+
|
| 3 |
+
page_list = [Home, Annotations, Contributeurs]
|
app/utils.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import re
|
| 4 |
+
import datetime
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
def get_audio_paths(folder: Path, skip: int = 2) -> list[str]:
    """Return the ``*.mp3`` files of *folder* as POSIX path strings, in segment order.

    Files are ordered by the number found after ``segment_`` in their path;
    files without such a number come last. Ties are broken by the path itself
    so the result does not depend on the order in which the filesystem lists
    the directory.

    Args:
        folder: Directory to scan (non-recursive).
        skip: Number of leading files (in sorted order) to leave out.
            Defaults to 2, matching the original behaviour of dropping the
            first two files. Negative values are treated as 0.

    Returns:
        The remaining paths, sorted. Empty when the folder has no mp3 files.
    """
    def segment_number(file_path: str) -> float:
        match = re.search(r"segment_(\d+)", file_path)
        # Files without a segment number sort after every numbered one.
        return int(match.group(1)) if match else float("inf")

    # Sort BEFORE slicing: glob() order is arbitrary, so slicing first would
    # drop two unpredictable files.
    ordered = sorted(
        (audio_path.as_posix() for audio_path in folder.glob("*.mp3")),
        key=lambda file_path: (segment_number(file_path), file_path),
    )
    return ordered[max(skip, 0):]
|
| 20 |
+
|
| 21 |
+
def clean_text(text: str) -> str:
    """Strip the ``+`` / ``*`` marker characters and double quotes from *text*.

    The substitutions are applied in a fixed order: regex rewrites first, then
    literal replacements, then removal of straight and curly double quotes.
    The result is stripped of leading and trailing whitespace.
    """
    regex_steps = (
        (r"\+\s*\.", "."),
        (r"\*\s*\+\s*;", ";"),
        (r"\*\s*\+", ""),
    )
    for pattern, replacement in regex_steps:
        text = re.sub(pattern, replacement, text)

    # Remaining markers become a single space.
    for marker in (" + ", " * ", "+"):
        text = text.replace(marker, " ")

    without_quotes = re.sub(r'["“”]', '', text)
    return without_quotes.strip()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def splitter(text: str) -> list[str]:
    """Clean *text* with ``clean_text`` and split it on ``,``, ``:``, ``;`` and ``.``."""
    cleaned = clean_text(text)
    separators = r"[,:;.]"
    return re.split(separators, cleaned)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def flatten_nested_values(nested_values: pd.Series) -> list[str]:
|
| 37 |
+
"""Aplati une liste imbriquée de valeurs textuelles en une liste simple."""
|
| 38 |
+
flattened = []
|
| 39 |
+
for group in nested_values:
|
| 40 |
+
for item in group:
|
| 41 |
+
cleaned_item = re.sub(r"^\d+\s*", "", item).strip()
|
| 42 |
+
if cleaned_item:
|
| 43 |
+
flattened.append(cleaned_item)
|
| 44 |
+
return flattened
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def load_data(file_path: str) -> list[str]:
    """Read the parquet data at *file_path* and return the flat list of suggestions.

    The ``verse_text`` column is split into fragments with ``splitter`` and the
    per-row fragment lists are flattened with ``flatten_nested_values``.
    """
    frame = pd.read_parquet(file_path, engine="pyarrow")
    fragments_per_verse = frame["verse_text"].apply(splitter)
    return flatten_nested_values(fragments_per_verse)
|
| 52 |
+
|
| 53 |
+
# Load the previously persisted data
def load_persistent_data(file) -> list:
    """Return the JSON content of *file*, or an empty list if it does not exist.

    The file is opened directly instead of being checked with
    ``os.path.exists`` first, so there is no window in which it can disappear
    between the check and the read.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid
            JSON. This is deliberately not swallowed: returning ``[]`` here
            would let the next save overwrite the damaged file.
    """
    try:
        with open(file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return []
|
| 59 |
+
|
| 60 |
+
# Save the persistent data
def save_persistent_data(data, file) -> None:
    """Write *data* to *file* as indented, non-ASCII-escaped UTF-8 JSON.

    The data is serialized before the file is opened: opening with ``'w'``
    truncates the file, so a serialization error would otherwise wipe the
    results saved so far.

    Raises:
        TypeError: If *data* contains a value that JSON cannot represent. The
            existing file is left untouched in that case.
    """
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    with open(file, 'w', encoding='utf-8') as f:
        f.write(payload)
|
requirements.txt
ADDED
|
Binary file (1.41 kB). View file
|
|
|