Spaces:

boffire
/

OpenLID-v3_test

Sleeping

App Files Files Community

boffire committed on Feb 17

Commit

9349334

verified ·

1 Parent(s): 4f83173

Create app.py

Browse files

Files changed (1) hide show

app.py +143 -0

app.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import fasttext
+from huggingface_hub import hf_hub_download
+import regex
+import gradio as gr
+import os
+# Preprocessing patterns
+# Matches any single character that is neither a word character (\p{Word})
+# nor a space separator (\p{Zs}), or any digit (\d). preprocess() deletes
+# every match of this pattern.
+NONWORD_REPLACE_STR = r"[^\p{Word}\p{Zs}]|\d"
+# Compiled once at import time so every call reuses the same pattern object.
+NONWORD_REPLACE_PATTERN = regex.compile(NONWORD_REPLACE_STR)
+# Matches a run of two or more whitespace characters; preprocess() replaces
+# each such run with a single space. A lone whitespace character is not matched.
+SPACE_PATTERN = regex.compile(r"\s\s+")
+def preprocess(text):
+    """Preprocess text for language identification."""
+    text = text.strip().replace('\n', ' ').lower()
+    text = regex.sub(SPACE_PATTERN, " ", text)
+    text = regex.sub(NONWORD_REPLACE_PATTERN, "", text)
+    return text
+# Load model once at startup
+# These statements run at import time, so the model is fetched and loaded
+# before the interface is built and before any prediction is served.
+print("Loading OpenLID-v3 model...")
+# Fetch the model file from the HPLT/OpenLID-v3 repository on the Hugging
+# Face Hub; the returned value is passed to fasttext.load_model as a path.
+model_path = hf_hub_download(
+    repo_id="HPLT/OpenLID-v3",
+    filename="openlid-v3.bin"
+)
+# Module-level model object shared by every predict_language() call.
+model = fasttext.load_model(model_path)
+print("Model loaded successfully!")
+def predict_language(text, top_k=3, threshold=0.5):
+    """
+    Predict language of input text.
+    Args:
+        text: Input text to analyze
+        top_k: Number of top predictions to return (1-10)
+        threshold: Confidence threshold (0.0-1.0)
+    """
+    if not text or not text.strip():
+        return "Please enter some text to analyze."
+    # Preprocess
+    processed_text = preprocess(text)
+    if not processed_text.strip():
+        return "Text contains no valid characters for language identification."
+    # Get predictions
+    predictions = model.predict(
+        text=processed_text,
+        k=min(top_k, 10),
+        threshold=threshold,
+        on_unicode_error="strict",
+    )
+    labels, scores = predictions
+    # Format results
+    results = []
+    for label, score in zip(labels, scores):
+        # Remove __label__ prefix and format
+        lang_code = label.replace("__label__", "")
+        confidence = float(score) * 100
+        results.append(f"**{lang_code}**: {confidence:.2f}%")
+    return "\n\n".join(results)
+# Create Gradio interface
+with gr.Blocks(title="OpenLID-v3 Language Identification") as demo:
+    gr.Markdown("""
+    # OpenLID-v3 Language Identifier
+    Identify the language of any text with state-of-the-art accuracy.
+    Supports 194+ language varieties.
+    *Model: [HPLT/OpenLID-v3](https://huggingface.co/HPLT/OpenLID-v3)*
+    """)
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(
+                label="Input Text",
+                placeholder="Enter text to identify its language...",
+                lines=5,
+                max_lines=10
+            )
+            with gr.Row():
+                top_k = gr.Slider(
+                    minimum=1,
+                    maximum=10,
+                    value=3,
+                    step=1,
+                    label="Top-K Predictions"
+                )
+                threshold = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.5,
+                    step=0.05,
+                    label="Confidence Threshold"
+                )
+            submit_btn = gr.Button("Identify Language", variant="primary")
+        with gr.Column():
+            output = gr.Markdown(label="Predictions")
+    # Examples with Kabyle and Occitan as defaults
+    gr.Examples(
+        examples=[
+            ["Asebter-a yura s wudem awurman d amagrad s tutlayt taqbaylit."],
+            ["Aqueste es un exemple de tèxte en occitan. L'occitan es una lenga romanica parlada en Occitània."],
+            ["Maskinsjefen er oppteken av å løfta fram dei maritime utdanningane."],
+            ["The quick brown fox jumps over the lazy dog."],
+            ["Le renard brun rapide saute par-dessus le chien paresseux."],
+            ["El rápido zorro marrón salta sobre el perro perezoso."],
+            ["Быстрая коричневая лисица прыгает через ленивую собаку."],
+            ["快速的棕色狐狸跳过了懒惰的狗。"],
+        ],
+        inputs=input_text,
+        label="Try these examples (Kabyle and Occitan featured)"
+    )
+    gr.Markdown("""
+    ### Tips for best results:
+    - Text is automatically preprocessed (lowercased, normalized)
+    - Longer texts generally give more accurate predictions
+    - The model supports 194+ language varieties
+    - Use higher thresholds to filter out uncertain predictions
+    """)
+    # Event handlers
+    submit_btn.click(
+        fn=predict_language,
+        inputs=[input_text, top_k, threshold],
+        outputs=output
+    )
+    input_text.submit(
+        fn=predict_language,
+        inputs=[input_text, top_k, threshold],
+        outputs=output
+    )
+# Launch the Gradio app only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    demo.launch()