# app.py
# Run with: streamlit run app.py
"""
Philological AI Ensemble — Streamlit (HF-ready, cleaned & optimized)

Features:
- Safe dataset loading (Philological_7525.xlsx)
- Language detection: BERT CLS embeddings -> sklearn MLP (trainable through UI)
- Restoration: Seq2Seq encoder-decoder LSTM (trainable through UI) with dataset exact-match fallback
- Meaning interpreter: CountVectorizer -> sklearn MLP (trainable through UI)
- Batch file evaluation (CSV/XLSX with 'text' column)
- HTML/JS controls: full-page screenshot, element screenshot, screen recording (MediaRecorder), print
- Saves models/tokenizers to disk (optional)
- Avoids Streamlit caching issues by not passing unhashable objects into @st.cache_data
- Hugging Face Spaces compatible (run via `streamlit run app.py` — HF runs this automatically)
"""

import html
import io
import logging
import os
import pickle
import time

import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer as KerasTokenizer
from transformers import AutoTokenizer, AutoModel

logger = logging.getLogger(__name__)

# -------------------------
# Configuration / constants
# -------------------------
st.set_page_config(page_title="Philological AI Ensemble", layout="wide")
APP_TITLE = "📚 Philological AI Ensemble"
DATA_PATH = "Philological_7525.xlsx"
SEQ2SEQ_MODEL_PATH = "restoration_seq2seq.h5"
SEQ2SEQ_TOKENIZERS_PATH = "restoration_tokenizers.pkl"
LANG_MLP_PATH = "lang_mlp.pkl"
MEAN_MLP_PATH = "mean_mlp.pkl"
TORCH_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Columns every dataset (on disk or uploaded) must provide.
REQUIRED_COLUMNS = ['original_text', 'language', 'corrupted_text', 'restored_text', 'english_meaning']

# Decoder sequence markers. They are plain lowercase words on purpose: the
# Keras tokenizer lowercases text and strips punctuation such as "<" and ">",
# so the marker must survive that filtering to be found in `word_index` later.
SEQ2SEQ_START_TOKEN = "startseq"
SEQ2SEQ_END_TOKEN = "endseq"
SEQ2SEQ_OOV_TOKEN = "<OOV>"

# -------------------------
# Top HTML / JS controls
# -------------------------
# NOTE(review): this literal contains no markup or script in the current
# source even though it is rendered with unsafe_allow_html — confirm whether
# the toolbar HTML/JS was lost and needs to be restored.
st.markdown(
    """

Idle

""",
    unsafe_allow_html=True,
)
st.title(APP_TITLE)


# -------------------------
# Dataset loading (safe)
# -------------------------
def _missing_columns(frame):
    """Return the REQUIRED_COLUMNS that are absent from *frame*."""
    return [c for c in REQUIRED_COLUMNS if c not in frame.columns]


def _read_table(file_or_path, name):
    """Read a CSV or Excel table, choosing the parser from *name*'s extension."""
    if str(name).lower().endswith(".csv"):
        return pd.read_csv(file_or_path)
    return pd.read_excel(file_or_path)


@st.cache_data
def load_dataset(path=DATA_PATH):
    """Load the dataset at *path*.

    Returns None when the file does not exist. If the Excel read fails, a CSV
    with the same stem is tried before the original error is re-raised.

    Raises:
        KeyError: if any required column is missing.
    """
    if not os.path.exists(path):
        return None
    try:
        df_local = pd.read_excel(path)
    except Exception as e:
        # fallback: try csv
        try:
            df_local = pd.read_csv(path.replace('.xlsx', '.csv'))
        except Exception:
            raise e
    missing = _missing_columns(df_local)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")
    return df_local.dropna(subset=REQUIRED_COLUMNS).reset_index(drop=True)


df = None
try:
    df = load_dataset()
except Exception as e:
    # don't stop; show message and allow uploading via UI
    st.warning(f"Could not load dataset from '{DATA_PATH}': {e}")

# Sidebar dataset info / upload
st.sidebar.header("Dataset")
if df is None:
    st.sidebar.warning("Dataset not found on disk. Upload Philological_7525.xlsx below or use sample.")
else:
    st.sidebar.write(f"Rows: {len(df)}")
    # astype(str) keeps sorted()/join() from failing on non-string labels.
    st.sidebar.write("Languages: " + ", ".join(sorted(df['language'].astype(str).unique())))

uploaded = st.sidebar.file_uploader("Upload dataset (.xlsx or .csv) to replace", type=["xlsx", "csv"])
if uploaded:
    try:
        df2 = _read_table(uploaded, uploaded.name)
        missing = _missing_columns(df2)
        if missing:
            st.sidebar.error(f"Uploaded file missing columns: {missing}")
        else:
            df = df2.dropna(subset=REQUIRED_COLUMNS).reset_index(drop=True)
            st.sidebar.success(f"Uploaded dataset accepted. Rows: {len(df)}")
            # Persist locally if possible, but only once per uploaded file:
            # the uploader keeps returning the same file on every rerun.
            upload_key = (uploaded.name, getattr(uploaded, "size", None))
            if st.session_state.get("persisted_upload") != upload_key:
                try:
                    df.to_excel(DATA_PATH, index=False, engine='openpyxl')
                except Exception as exc:
                    logger.warning("Could not persist uploaded dataset to %s: %s", DATA_PATH, exc)
                else:
                    # The file on disk changed, so the cached load is stale.
                    load_dataset.clear()
                    st.session_state["persisted_upload"] = upload_key
    except Exception as e:
        st.sidebar.error(f"Could not read uploaded file: {e}")


# -------------------------
# BERT loading (cached resource)
# -------------------------
@st.cache_resource
def load_bert(model_name="bert-base-uncased"):
    """Load the tokenizer and model for *model_name*, in eval mode on TORCH_DEVICE."""
    tok = AutoTokenizer.from_pretrained(model_name)
    m = AutoModel.from_pretrained(model_name)
    m.to(TORCH_DEVICE)
    m.eval()
    return tok, m


# Attempt to load BERT lazily when needed
bert_tokenizer = None
bert_model = None
if st.sidebar.checkbox("Load BERT now (optional, required for language detection)", value=False):
    with st.spinner("Loading BERT..."):
        try:
            bert_tokenizer, bert_model = load_bert("bert-base-uncased")
            st.sidebar.success("BERT loaded")
        except Exception as e:
            st.sidebar.error(f"BERT load failed: {e}")


# -------------------------
# Embedding helper (hashable args)
# -------------------------
@st.cache_data
def compute_cls_embeddings_cached(texts_tuple, batch_size=16):
    """Return one CLS embedding per text as a 2-D numpy array.

    *texts_tuple* must be a tuple so Streamlit can hash it for caching. The
    tokenizer and model are read from module globals instead of being passed in.

    Raises:
        RuntimeError: if BERT has not been loaded.
    """
    if bert_tokenizer is None or bert_model is None:
        raise RuntimeError("BERT not loaded")
    texts = list(texts_tuple)
    if not texts:
        return np.zeros((0, bert_model.config.hidden_size))
    all_emb = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            encoded = bert_tokenizer(batch, padding=True, truncation=True, max_length=128, return_tensors="pt")
            input_ids = encoded['input_ids'].to(TORCH_DEVICE)
            attention_mask = encoded['attention_mask'].to(TORCH_DEVICE)
            outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
            # Position 0 of the last hidden state is the [CLS] vector.
            cls = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_emb.append(cls)
    return np.vstack(all_emb)


def compute_cls_embeddings(texts, batch_size=16):
    """Wrapper that converts *texts* to a tuple and calls the cached embedder.

    Accepts a single string or any iterable of strings.

    Raises:
        RuntimeError: if BERT has not been loaded.
    """
    if bert_tokenizer is None or bert_model is None:
        raise RuntimeError("BERT not loaded")
    if isinstance(texts, str):
        texts = (texts,)
    elif not isinstance(texts, tuple):
        # Lists, Series, arrays, generators: all become a hashable tuple.
        texts = tuple(texts)
    return compute_cls_embeddings_cached(texts, batch_size=batch_size)


def _split_train_test(features, labels, test_size=0.15, random_state=42):
    """Split into train/test, stratifying only when the labels allow it.

    A stratified split raises when a class has a single member or when either
    side is smaller than the number of classes; in those cases fall back to a
    plain shuffled split instead of failing the whole training run.
    """
    labels = np.asarray(labels)
    _, counts = np.unique(labels, return_counts=True)
    n_test = int(np.ceil(test_size * len(labels)))
    n_train = len(labels) - n_test
    can_stratify = counts.min() >= 2 and n_test >= len(counts) and n_train >= len(counts)
    return train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels if can_stratify else None,
    )


# -------------------------
# Language MLP (train/load)
# -------------------------
def train_language_mlp(df_local, save_path=LANG_MLP_PATH):
    """Train the BERT-embedding -> MLP language classifier.

    Returns:
        (clf, label_encoder, accuracy). *clf* is None and accuracy 0.0 when
        fewer than 5 rows are available.

    Raises:
        RuntimeError: if BERT has not been loaded.
    """
    if bert_tokenizer is None or bert_model is None:
        raise RuntimeError("Load BERT before training language classifier")
    le = LabelEncoder()
    y = le.fit_transform(df_local['language'].astype(str))
    texts = df_local['original_text'].astype(str).tolist()
    emb = compute_cls_embeddings(texts, batch_size=16)
    if emb.shape[0] < 5:
        return None, le, 0.0
    X_train, X_test, y_train, y_test = _split_train_test(emb, y)
    clf = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=400, random_state=42)
    clf.fit(X_train, y_train)
    acc = accuracy_score(y_test, clf.predict(X_test))
    # Saving is best-effort: the trained model is still returned on failure.
    try:
        with open(save_path, "wb") as f:
            pickle.dump({'clf': clf, 'le': le}, f)
    except Exception as exc:
        logger.warning("Could not save language classifier to %s: %s", save_path, exc)
    return clf, le, acc


def load_lang_clf_from_disk(path=LANG_MLP_PATH):
    """Return (clf, label_encoder) unpickled from *path*, or (None, None)."""
    if os.path.exists(path):
        # SECURITY: pickle executes arbitrary code on load; only use files
        # written by this app.
        try:
            with open(path, "rb") as f:
                data = pickle.load(f)
            return data.get('clf'), data.get('le')
        except Exception as exc:
            logger.warning("Could not load language classifier from %s: %s", path, exc)
            return None, None
    return None, None


lang_clf, lang_le = load_lang_clf_from_disk()
lang_acc = 0.0
# If model exists but BERT not loaded, it's okay for UI; compute embeddings only when used
if lang_clf is None and df is not None and st.sidebar.button("Train language classifier (BERT→MLP)"):
    if bert_tokenizer is None or bert_model is None:
        st.sidebar.error("Load BERT first (checkbox on sidebar).")
    else:
        with st.spinner("Training language classifier..."):
            try:
                lang_clf, lang_le, lang_acc = train_language_mlp(df)
                st.sidebar.success(f"Trained language classifier (acc: {lang_acc:.2%})")
            except Exception as e:
                st.sidebar.error(f"Training failed: {e}")
if lang_clf is not None:
    st.sidebar.write("Language classifier: loaded")


def predict_language(text):
    """Return (language, confidence_percent) for *text*.

    Returns ("Unknown", 0.0) for empty text or when no classifier is loaded.
    """
    if not text or lang_clf is None or lang_le is None:
        return "Unknown", 0.0
    emb = compute_cls_embeddings([text], batch_size=16)
    probs = lang_clf.predict_proba(emb)[0]
    idx = int(np.argmax(probs))
    # Probability columns follow clf.classes_, which need not be 0..k-1 when a
    # label never reached the training split.
    lang = lang_le.inverse_transform([lang_clf.classes_[idx]])[0]
    conf = float(probs[idx] * 100)
    return lang, conf


# -------------------------
# Seq2Seq restoration utilities
# -------------------------
def create_tokenizers_and_sequences(corrupted_texts, restored_texts, num_words_in=5000, num_words_out=5000,
                                    maxlen_in=50, maxlen_out=50):
    """Fit input/output tokenizers and build padded teacher-forcing arrays.

    Each restored text is wrapped in the start/end marker words. The decoder
    input is the wrapped sequence without its last token and the decoder
    target is the same sequence without its first token.

    Returns:
        dict with the tokenizers, the three padded arrays, both max lengths
        and the marker words that were used.
    """
    start_token = SEQ2SEQ_START_TOKEN
    end_token = SEQ2SEQ_END_TOKEN
    out_texts_with_tokens = [f"{start_token} {t} {end_token}" for t in restored_texts]
    in_tok = KerasTokenizer(num_words=num_words_in, oov_token=SEQ2SEQ_OOV_TOKEN)
    out_tok = KerasTokenizer(num_words=num_words_out, oov_token=SEQ2SEQ_OOV_TOKEN)
    in_tok.fit_on_texts(corrupted_texts)
    out_tok.fit_on_texts(out_texts_with_tokens)
    in_seq = in_tok.texts_to_sequences(corrupted_texts)
    out_seq = out_tok.texts_to_sequences(out_texts_with_tokens)
    encoder_input_data = pad_sequences(in_seq, maxlen=maxlen_in, padding='post')
    # truncating='post' keeps the start marker when a target exceeds
    # maxlen_out; the default ('pre') would cut it off the front.
    decoder_input_data = pad_sequences([s[:-1] for s in out_seq], maxlen=maxlen_out,
                                       padding='post', truncating='post')
    decoder_target_data = pad_sequences([s[1:] for s in out_seq], maxlen=maxlen_out,
                                        padding='post', truncating='post')
    return {
        "in_tok": in_tok,
        "out_tok": out_tok,
        "encoder_input_data": encoder_input_data,
        "decoder_input_data": decoder_input_data,
        "decoder_target_data": decoder_target_data,
        "maxlen_in": maxlen_in,
        "maxlen_out": maxlen_out,
        "start_token": start_token,
        "end_token": end_token,
    }


def build_seq2seq_model(vocab_in, vocab_out, embed_dim=128, latent_dim=256, maxlen_in=50, maxlen_out=50):
    """Build and compile the encoder-decoder LSTM used for restoration.

    The encoder's final hidden and cell states initialise the decoder, whose
    per-step outputs go through a softmax over the output vocabulary.
    """
    encoder_inputs = Input(shape=(maxlen_in,), name='encoder_inputs')
    encoder_emb = Embedding(vocab_in, embed_dim, mask_zero=True, name='encoder_embedding')(encoder_inputs)
    encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
    _, state_h, state_c = encoder_lstm(encoder_emb)
    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(maxlen_out,), name='decoder_inputs')
    decoder_emb = Embedding(vocab_out, embed_dim, mask_zero=True, name='decoder_embedding')(decoder_inputs)
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
    decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)
    decoder_dense = Dense(vocab_out, activation='softmax', name='decoder_dense')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


def _load_seq2seq_from_disk():
    """Return (model, in_tok, out_tok, params) from the saved files, or None."""
    if not (os.path.exists(SEQ2SEQ_MODEL_PATH) and os.path.exists(SEQ2SEQ_TOKENIZERS_PATH)):
        return None
    # SECURITY: pickle executes arbitrary code on load; only use files
    # written by this app.
    try:
        model = load_model(SEQ2SEQ_MODEL_PATH)
        with open(SEQ2SEQ_TOKENIZERS_PATH, "rb") as f:
            tokdata = pickle.load(f)
        return model, tokdata.get('in_tok'), tokdata.get('out_tok'), tokdata.get('params', {})
    except Exception as exc:
        logger.warning("Could not load saved seq2seq model: %s", exc)
        return None


# Try loading seq2seq saved files
seq2seq_available = False
seq2seq_model = None
seq2seq_in_tok = None
seq2seq_out_tok = None
seq2seq_params = {}
_seq2seq_loaded = _load_seq2seq_from_disk()
if _seq2seq_loaded is not None:
    seq2seq_model, seq2seq_in_tok, seq2seq_out_tok, seq2seq_params = _seq2seq_loaded
    seq2seq_available = True


def train_seq2seq(corrupted_texts, restored_texts, save_model=True, num_words_in=5000, num_words_out=5000,
                  maxlen_in=50, maxlen_out=50, epochs=10, batch_size=32):
    """Train the restoration model and optionally save it with its tokenizers.

    Returns:
        (model, in_tok, out_tok, history)
    """
    toks = create_tokenizers_and_sequences(corrupted_texts, restored_texts,
                                           num_words_in=num_words_in, num_words_out=num_words_out,
                                           maxlen_in=maxlen_in, maxlen_out=maxlen_out)
    in_tok = toks['in_tok']
    out_tok = toks['out_tok']
    enc_in = toks['encoder_input_data']
    dec_in = toks['decoder_input_data']
    dec_tgt = toks['decoder_target_data']
    vocab_in = min(num_words_in, len(in_tok.word_index) + 1)
    vocab_out = min(num_words_out, len(out_tok.word_index) + 1)
    model = build_seq2seq_model(vocab_in, vocab_out, maxlen_in=maxlen_in, maxlen_out=maxlen_out)
    # dec_tgt needs shape (samples, timesteps, 1) for sparse_categorical_crossentropy if labels are integers
    history = model.fit([enc_in, dec_in], np.expand_dims(dec_tgt, -1),
                        epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1)
    if save_model:
        params = {
            'maxlen_in': maxlen_in,
            'maxlen_out': maxlen_out,
            # Recorded so inference uses the markers this model was trained with.
            'start_token': toks['start_token'],
            'end_token': toks['end_token'],
        }
        try:
            model.save(SEQ2SEQ_MODEL_PATH)
            with open(SEQ2SEQ_TOKENIZERS_PATH, "wb") as f:
                pickle.dump({'in_tok': in_tok, 'out_tok': out_tok, 'params': params}, f)
        except Exception as exc:
            logger.warning("Could not save seq2seq model/tokenizers: %s", exc)
    return model, in_tok, out_tok, history


def seq2seq_infer(input_text):
    """Greedy-decode a restoration for *input_text* with the loaded model.

    Dataset exact-match fallback is handled outside; this is model inference only.

    Returns:
        (restored_text, confidence_percent). Confidence is the mean
        probability of the emitted tokens; ("", 0.0) is returned when the
        model is unavailable or nothing could be decoded.
    """
    if not seq2seq_available or seq2seq_model is None or seq2seq_in_tok is None or seq2seq_out_tok is None:
        return "", 0.0
    maxlen_in = seq2seq_params.get('maxlen_in', 50)
    maxlen_out = seq2seq_params.get('maxlen_out', 50)
    # Tokenizers saved before the markers were recorded used the empty string
    # for both markers, so that is the fallback for older files.
    start_token = seq2seq_params.get('start_token', "")
    end_token = seq2seq_params.get('end_token', "")
    in_seq = seq2seq_in_tok.texts_to_sequences([str(input_text)])
    enc_input = pad_sequences(in_seq, maxlen=maxlen_in, padding='post')
    start_index = seq2seq_out_tok.word_index.get(start_token)
    end_index = seq2seq_out_tok.word_index.get(end_token)
    if start_index is None:
        return "", 0.0
    cur_seq = [start_index]
    decoded = []
    token_probs = []
    for _ in range(maxlen_out):
        dec_input = pad_sequences([cur_seq], maxlen=maxlen_out, padding='post')
        preds = seq2seq_model.predict([enc_input, dec_input], verbose=0)
        # The distribution for the next token sits at the last filled position.
        tok_probs = preds[0, len(cur_seq) - 1]
        next_tok = int(np.argmax(tok_probs))
        if next_tok == 0 or next_tok == end_index:
            break
        word = seq2seq_out_tok.index_word.get(next_tok)
        if word is None:
            break
        decoded.append(word)
        token_probs.append(float(tok_probs[next_tok]))
        cur_seq.append(next_tok)
    if not decoded:
        return "", 0.0
    return " ".join(decoded), float(np.mean(token_probs) * 100)


# -------------------------
# Meaning interpreter (CountVectorizer -> MLP)
# -------------------------
def train_meaning_interpreter(df_local, save_path=MEAN_MLP_PATH):
    """Train the bag-of-words -> MLP meaning classifier.

    Returns:
        (vectorizer, clf, label_encoder, accuracy). *clf* is None and
        accuracy 0.0 when fewer than 5 rows are available.
    """
    vec = CountVectorizer(max_features=5000)
    X = vec.fit_transform(df_local['restored_text'].astype(str).tolist())
    le = LabelEncoder()
    y = le.fit_transform(df_local['english_meaning'].astype(str))
    if X.shape[0] < 5:
        return vec, None, le, 0.0
    X_train, X_test, y_train, y_test = _split_train_test(X, y)
    clf = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=400, random_state=42)
    clf.fit(X_train, y_train)
    acc = accuracy_score(y_test, clf.predict(X_test))
    # Saving is best-effort: the trained model is still returned on failure.
    try:
        with open(save_path, "wb") as f:
            pickle.dump({'vec': vec, 'clf': clf, 'le': le}, f)
    except Exception as exc:
        logger.warning("Could not save meaning interpreter to %s: %s", save_path, exc)
    return vec, clf, le, acc


def load_meaning_from_disk(path=MEAN_MLP_PATH):
    """Return (vectorizer, clf, label_encoder) from *path*, or three Nones."""
    if os.path.exists(path):
        # SECURITY: pickle executes arbitrary code on load; only use files
        # written by this app.
        try:
            with open(path, "rb") as f:
                data = pickle.load(f)
            return data.get('vec'), data.get('clf'), data.get('le')
        except Exception as exc:
            logger.warning("Could not load meaning interpreter from %s: %s", path, exc)
            return None, None, None
    return None, None, None


vec_mean, meaning_clf, meaning_le = load_meaning_from_disk()
meaning_acc = 0.0
if meaning_clf is None and df is not None and st.sidebar.button("Train meaning interpreter (CountVectorizer→MLP)"):
    with st.spinner("Training meaning interpreter..."):
        try:
            vec_mean, meaning_clf, meaning_le, meaning_acc = train_meaning_interpreter(df)
            st.sidebar.success(f"Trained meaning interpreter (acc: {meaning_acc:.2%})")
        except Exception as e:
            st.sidebar.error(f"Meaning training failed: {e}")
if meaning_clf is not None:
    st.sidebar.write("Meaning interpreter: loaded")


def predict_meaning(restored_text):
    """Return (english_meaning, confidence_percent) for *restored_text*.

    Returns ("", 0.0) when no interpreter is loaded or the text is blank, so
    a failed restoration does not get an arbitrary meaning attached.
    """
    if meaning_clf is None or vec_mean is None or meaning_le is None:
        return "", 0.0
    text = str(restored_text)
    if not text.strip():
        return "", 0.0
    Xv = vec_mean.transform([text])
    probs = meaning_clf.predict_proba(Xv)[0]
    idx = int(np.argmax(probs))
    # Probability columns follow clf.classes_, which need not be 0..k-1 when a
    # label never reached the training split.
    meaning = meaning_le.inverse_transform([meaning_clf.classes_[idx]])[0]
    conf = float(probs[idx] * 100)
    return meaning, conf


# -------------------------
# Restoration wrapper (dataset match -> seq2seq -> fallback)
# -------------------------
_restoration_lookup = None


def _get_restoration_lookup():
    """Return a dict of normalised corrupted text -> restored text, built once per run."""
    global _restoration_lookup
    if _restoration_lookup is None:
        lookup = {}
        if df is not None:
            keys = df['corrupted_text'].astype(str).str.strip().str.lower()
            for key, restored in zip(keys, df['restored_text']):
                # setdefault keeps the first dataset row for duplicate keys.
                lookup.setdefault(key, restored)
        _restoration_lookup = lookup
    return _restoration_lookup


def restore_text(corrupt_text):
    """Restore *corrupt_text*.

    Tries a case-insensitive exact match against the dataset first, then the
    seq2seq model.

    Returns:
        (restored_text, confidence_percent, from_dataset)
    """
    key = str(corrupt_text).strip().lower()
    lookup = _get_restoration_lookup()
    if key in lookup:
        return lookup[key], 99.0, True
    # seq2seq
    if seq2seq_available and seq2seq_model is not None:
        try:
            restored, conf = seq2seq_infer(corrupt_text)
            return restored, conf, False
        except Exception:
            logger.exception("Seq2Seq inference failed")
    # fallback placeholder
    return "", 0.0, False


# -------------------------
# UI: Tabs
# -------------------------
tabs = st.tabs(["Home", "Language Detection", "Restoration", "Meaning Interpreter", "File Evaluation",
                "Tokenizer", "About"])

# Home
with tabs[0]:
    st.header("Welcome")
    st.markdown("""
    This application provides a philological analysis tool:
    - **Language detection** (BERT CLS -> MLP)
    - **Restoration** (Seq2Seq LSTM) with exact-match dataset fallback
    - **Meaning interpreter** (CountVectorizer -> MLP)

    Use the sidebar to load/train components and the tabs to perform single/batch analysis.
    Use the top buttons for screenshots, recording and printing.
    """)

# Language Detection Tab
with tabs[1]:
    st.header("🔎 Language Detection")
    sample_text = st.text_area("Enter text to detect language", height=140, key="lang_text")
    col1, col2 = st.columns([1, 1])
    with col1:
        if st.button("Detect Language"):
            if not sample_text.strip():
                st.warning("Please enter text first.")
            elif bert_tokenizer is None or bert_model is None:
                st.error("BERT not loaded. Check sidebar and load BERT.")
            elif lang_clf is None:
                st.error("Language classifier not trained. Train from sidebar.")
            else:
                with st.spinner("Predicting language..."):
                    try:
                        lang, conf = predict_language(sample_text)
                        st.success(f"Predicted language: {lang} ({conf:.2f}%)")
                    except Exception as e:
                        st.error(f"Prediction failed: {e}")
    with col2:
        st.markdown("**Status**")
        st.write(f"BERT loaded: {'Yes' if bert_tokenizer is not None else 'No'}")
        st.write(f"Language classifier: {'Loaded' if lang_clf is not None else 'Not trained'}")

# Restoration Tab
with tabs[2]:
    # NOTE(review): this wrapper literal (and its counterpart at the end of
    # the tab) holds only line breaks in the current source — confirm whether
    # container markup was lost.
    st.markdown("\n\n", unsafe_allow_html=True)
    st.header("🛠 Restoration")
    input_text = st.text_area("Enter corrupted or original text", height=140, key="restore_text")
    if st.button("Restore Text"):
        if not input_text.strip():
            st.warning("Enter text first.")
        else:
            # predict language optionally
            lang_pred, lang_conf = "Unknown", 0.0
            if bert_tokenizer is not None and bert_model is not None and lang_clf is not None:
                try:
                    lang_pred, lang_conf = predict_language(input_text)
                except Exception:
                    logger.exception("Language prediction failed")
                    lang_pred, lang_conf = "Unknown", 0.0
            restored_text, rest_conf, from_ds = restore_text(input_text)
            meaning_text, meaning_conf = predict_meaning(restored_text)
            st.success(f"Detected language: {lang_pred} ({lang_conf:.2f}%)")
            if from_ds:
                st.info("Restoration: exact match from dataset (high confidence).")
            else:
                st.info("Restoration: model-based result (verify).")
            st.write("**Restored Text:**")
            st.write(restored_text)
            st.write(f"Restoration Confidence: {rest_conf:.2f}%")
            st.write("**English Meaning:**")
            st.write(meaning_text)
            st.write(f"Meaning Confidence: {meaning_conf:.2f}%")
            # HTML block target for screenshot. The values come from user
            # input and the dataset and are rendered with unsafe_allow_html,
            # so they are escaped before interpolation.
            result_lines = [
                "Translation Result",
                f"Detected Language: {html.escape(str(lang_pred))} ({lang_conf:.2f}%)",
                f"Restored Text: {html.escape(str(restored_text))}",
                f"Restoration Confidence: {rest_conf:.2f}%",
                f"English Meaning: {html.escape(str(meaning_text))}",
                f"Meaning Confidence: {meaning_conf:.2f}%",
            ]
            html_block = "\n\n" + "\n\n".join(result_lines) + "\n\n"
            st.markdown(html_block, unsafe_allow_html=True)
    st.markdown("\n\n", unsafe_allow_html=True)

# Meaning Interpreter Tab
with tabs[3]:
    st.header("🔠 Meaning Interpreter")
    restored_input = st.text_area("Enter restored text (or let Restoration produce it)", height=140,
                                  key="meaning_text")
    if st.button("Predict Meaning"):
        if not restored_input.strip():
            st.warning("Enter restored text first.")
        else:
            meaning_pred, meaning_conf = predict_meaning(restored_input)
            st.success("Predicted English meaning:")
            st.write(meaning_pred)
            st.write(f"Confidence: {meaning_conf:.2f}%")

# File Evaluation Tab
with tabs[4]:
    st.header("📂 File Evaluation (batch)")
    uploaded_file = st.file_uploader("Upload CSV or XLSX (must have 'text' column)", type=["csv", "xlsx"])
    batch_size = int(st.number_input("Embedding batch size (for language detection)",
                                     min_value=4, max_value=128, value=16, step=4))
    if uploaded_file:
        try:
            batch_df = _read_table(uploaded_file, uploaded_file.name)
        except Exception as e:
            st.error(f"Could not read uploaded file: {e}")
            batch_df = None
        if batch_df is not None:
            if 'text' not in batch_df.columns:
                st.error("Uploaded file must contain a 'text' column.")
            elif st.button("Process file"):
                # Empty cells become "" rather than the literal string "nan".
                texts = batch_df['text'].fillna("").astype(str).tolist()
                n = len(texts)
                st.info(f"Processing {n} rows (batch_size={batch_size})")
                # Embeddings
                if bert_tokenizer is None or bert_model is None or lang_clf is None or lang_le is None:
                    st.warning("BERT or language classifier not ready — language columns will be 'Unknown'")
                    lang_preds = ["Unknown"] * n
                    lang_confs = [0.0] * n
                elif n == 0:
                    lang_preds = []
                    lang_confs = []
                else:
                    all_embs = []
                    for i in range(0, n, batch_size):
                        chunk = texts[i:i + batch_size]
                        all_embs.append(compute_cls_embeddings(chunk, batch_size=batch_size))
                        st.write(f"Computed embeddings: {min(i + batch_size, n)}/{n}")
                    Xemb = np.vstack(all_embs)
                    probs = lang_clf.predict_proba(Xemb)
                    idxs = np.argmax(probs, axis=1)
                    # Map probability columns to encoded labels via classes_.
                    lang_preds = lang_le.inverse_transform(lang_clf.classes_[idxs])
                    lang_confs = (np.max(probs, axis=1) * 100).tolist()
                # restoration + meaning
                restored_list = []
                rest_conf_list = []
                meaning_list = []
                meaning_conf_list = []
                for t in texts:
                    r, rc, _ = restore_text(t)
                    restored_list.append(r)
                    rest_conf_list.append(rc)
                    m, mc = predict_meaning(r)
                    meaning_list.append(m)
                    meaning_conf_list.append(mc)
                batch_df['Detected_Language'] = lang_preds
                batch_df['Language_Confidence'] = lang_confs
                batch_df['Restored_Text'] = restored_list
                batch_df['Restoration_Confidence'] = rest_conf_list
                batch_df['English_Meaning'] = meaning_list
                batch_df['Meaning_Confidence'] = meaning_conf_list
                st.dataframe(batch_df.head(300))
                # download
                towrite = io.BytesIO()
                try:
                    batch_df.to_excel(towrite, index=False, engine='openpyxl')
                    towrite.seek(0)
                    st.download_button(
                        "⬇️ Download results (xlsx)",
                        data=towrite,
                        file_name="batch_results.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    )
                except Exception:
                    # Excel export unavailable: offer CSV instead.
                    st.download_button(
                        "⬇️ Download CSV",
                        data=batch_df.to_csv(index=False).encode('utf-8'),
                        file_name="batch_results.csv",
                    )

# Tokenizer Tab
with tabs[5]:
    st.header("🧩 Tokenizer / Inspect")
    txt_token = st.text_area("Enter text to visualize tokenizer tokens", height=140, key="token_text")
    if st.button("Show tokens"):
        if bert_tokenizer is None:
            st.error("BERT tokenizer not loaded. Load BERT via sidebar.")
        else:
            tokens = bert_tokenizer.tokenize(txt_token)
            st.write(tokens)
            ids = bert_tokenizer.encode(txt_token)
            st.write("Token IDs:", ids)

# About
with tabs[6]:
    st.header("ℹ️ About")
    st.markdown("""
    **Philological AI Ensemble**
    - Language detection: BERT CLS embeddings -> MLP classifier
    - Restoration: Seq2Seq encoder-decoder LSTM (dataset exact-match fallback)
    - Meaning interpreter: CountVectorizer -> MLP classifier
    - Features: batch processing, model saving/loading, screenshots, screen recording, print.
    """)
    st.markdown("**Notes:** Keep `Philological_7525.xlsx` in the repo root for automatic load. "
                "Use the sidebar to upload/replace dataset or to trigger training of components.")

# -------------------------
# Extra: Train seq2seq via UI (safe)
# -------------------------
st.sidebar.markdown("---")
st.sidebar.header("Training Controls")
if st.sidebar.checkbox("Show training utilities", value=False):
    st.sidebar.subheader("Seq2Seq training")
    if df is None or df.empty:
        st.sidebar.info("Upload dataset to enable seq2seq training.")
    else:
        # min_value is clamped so datasets with fewer than 100 rows do not
        # produce a widget whose minimum exceeds its maximum.
        max_rows = int(st.sidebar.number_input("Max rows to use for seq2seq (balanced)",
                                               min_value=min(100, len(df)), max_value=len(df),
                                               value=min(2000, len(df)), step=100))
        seq_epochs = int(st.sidebar.number_input("Seq2Seq epochs", min_value=1, max_value=100, value=5))
        seq_batch = int(st.sidebar.number_input("Seq2Seq batch size", min_value=8, max_value=512, value=64))
        if st.sidebar.button("Start Seq2Seq training"):
            corrupted = df['corrupted_text'].astype(str).tolist()[:max_rows]
            restored = df['restored_text'].astype(str).tolist()[:max_rows]
            with st.spinner("Training seq2seq (this may take long)..."):
                try:
                    model, in_tok, out_tok, history = train_seq2seq(corrupted, restored, epochs=seq_epochs,
                                                                    batch_size=seq_batch, save_model=True)
                    st.sidebar.success("Seq2Seq training finished and saved.")
                    # reload
                    _seq2seq_loaded = _load_seq2seq_from_disk()
                    if _seq2seq_loaded is not None:
                        seq2seq_model, seq2seq_in_tok, seq2seq_out_tok, seq2seq_params = _seq2seq_loaded
                        seq2seq_available = True
                    else:
                        seq2seq_available = False
                except Exception as e:
                    st.sidebar.error(f"Seq2Seq training failed: {e}")
    st.sidebar.subheader("Meaning interpreter training")
    if st.sidebar.button("Train meaning interpreter on full dataset"):
        if df is None:
            st.sidebar.error("Upload dataset first.")
        else:
            with st.spinner("Training meaning interpreter..."):
                try:
                    vec_mean, meaning_clf, meaning_le, meaning_acc = train_meaning_interpreter(df)
                    st.sidebar.success(f"Meaning interpreter trained (acc: {meaning_acc:.2%})")
                except Exception as e:
                    st.sidebar.error(f"Meaning training failed: {e}")
# End of app