', unsafe_allow_html=True)
# Restoration tab body: takes free text, optionally predicts its language,
# restores it via restore_text(), interprets the restored text via
# predict_meaning(), and renders the results.
# NOTE(review): the enclosing block header is not visible in this part of the
# file and leading indentation appears to have been lost — confirm nesting
# against the original file.
st.header("🛠 Restoration")
input_text = st.text_area("Enter corrupted or original text", height=140, key="restore_text")
if st.button("Restore Text"):
# Reject empty / whitespace-only input before calling any model.
if not input_text.strip():
st.warning("Enter text first.")
else:
# Language prediction is optional: it only runs when the BERT tokenizer,
# the BERT model and the language classifier are all available. Otherwise,
# or on any exception, the result stays ("Unknown", 0.0).
try:
lang_pred, lang_conf = ("Unknown", 0.0)
if bert_tokenizer is not None and bert_model is not None and lang_clf is not None:
lang_pred, lang_conf = predict_language(input_text)
except Exception:
# Best-effort: a language-detection failure must not block restoration.
lang_pred, lang_conf = ("Unknown", 0.0)
# restore_text returns (restored text, confidence, from-dataset flag);
# the meaning is predicted from the restored text, not from the raw input.
restored_text, rest_conf, from_ds = restore_text(input_text)
meaning_text, meaning_conf = predict_meaning(restored_text)
st.success(f"Detected language: {lang_pred} ({lang_conf:.2f}%)")
# Tell the user whether the result came from the dataset or from the model.
if from_ds:
st.info("Restoration: exact match from dataset (high confidence).")
else:
st.info("Restoration: model-based result (verify).")
st.write("**Restored Text:**")
st.write(restored_text)
st.write(f"Restoration Confidence: {rest_conf:.2f}%")
st.write("**English Meaning:**")
st.write(meaning_text)
st.write(f"Meaning Confidence: {meaning_conf:.2f}%")
# HTML block target for screenshot
# NOTE(review): restored_text / meaning_text are interpolated unescaped into
# markdown rendered with unsafe_allow_html=True — consider html.escape() if
# this text can contain markup. The block below also looks like it has lost
# its HTML tags; confirm against the original file.
html_block = f'''
Translation Result
Detected Language: {lang_pred} ({lang_conf:.2f}%)
Restored Text: {restored_text}
Restoration Confidence: {rest_conf:.2f}%
English Meaning: {meaning_text}
Meaning Confidence: {meaning_conf:.2f}%
'''
st.markdown(html_block, unsafe_allow_html=True)
# NOTE(review): the single-quoted literal below spans two lines, which is not
# valid Python; its markup content appears to be missing — restore it from
# the original file.
st.markdown('
', unsafe_allow_html=True)
# Meaning Interpreter Tab
# Lets the user paste already-restored text and shows the meaning predicted
# by predict_meaning() together with its confidence.
with tabs[3]:
    st.header("🔠 Meaning Interpreter")
    restored_input = st.text_area(
        "Enter restored text (or let Restoration produce it)",
        height=140,
        key="meaning_text",
    )
    predict_clicked = st.button("Predict Meaning")
    if predict_clicked and not restored_input.strip():
        # Nothing but whitespace was entered; ask for input instead of predicting.
        st.warning("Enter restored text first.")
    elif predict_clicked:
        meaning_pred, meaning_conf = predict_meaning(restored_input)
        st.success("Predicted English meaning:")
        st.write(meaning_pred)
        st.write(f"Confidence: {meaning_conf:.2f}%")
# File Evaluation Tab
# Batch mode: reads an uploaded CSV/XLSX with a 'text' column, runs language
# detection (when the BERT components are loaded), restoration and meaning
# prediction on every row, shows a preview and offers the results for download.
with tabs[4]:
    st.header("📂 File Evaluation (batch)")
    uploaded_file = st.file_uploader("Upload CSV or XLSX (must have 'text' column)", type=["csv", "xlsx"])
    batch_size = st.number_input("Embedding batch size (for language detection)", min_value=4, max_value=128, value=16, step=4)
    if uploaded_file:
        try:
            # Compare the extension case-insensitively so "DATA.CSV" is not
            # mistakenly sent to the Excel reader.
            if uploaded_file.name.lower().endswith(".csv"):
                batch_df = pd.read_csv(uploaded_file)
            else:
                batch_df = pd.read_excel(uploaded_file)
        except Exception as e:
            st.error(f"Could not read uploaded file: {e}")
            batch_df = None
        if batch_df is not None:
            if 'text' not in batch_df.columns:
                st.error("Uploaded file must contain a 'text' column.")
            elif st.button("Process file"):
                texts = batch_df['text'].astype(str).tolist()
                n = len(texts)
                if n == 0:
                    # np.vstack([]) raises on an empty list, so stop early
                    # instead of crashing on a header-only file.
                    st.warning("Uploaded file contains no rows to process.")
                else:
                    st.info(f"Processing {n} rows (batch_size={batch_size})")
                    # Embeddings -> language predictions
                    if bert_tokenizer is None or bert_model is None or lang_clf is None:
                        st.warning("BERT or language classifier not ready — language columns will be 'Unknown'")
                        lang_preds = ["Unknown"] * n
                        lang_confs = [0.0] * n
                    else:
                        all_embs = []
                        for i in range(0, n, batch_size):
                            chunk = texts[i:i+batch_size]
                            enc_emb = compute_cls_embeddings(chunk, batch_size=batch_size)
                            all_embs.append(enc_emb)
                            st.write(f"Computed embeddings: {min(i+batch_size, n)}/{n}")
                        Xemb = np.vstack(all_embs)
                        probs = lang_clf.predict_proba(Xemb)
                        idxs = np.argmax(probs, axis=1)
                        lang_preds = lang_le.inverse_transform(idxs)
                        # Confidence is the top class probability as a percentage.
                        lang_confs = (np.max(probs, axis=1) * 100).tolist()
                    # restoration + meaning (meaning is predicted from the restored text)
                    restored_list = []
                    rest_conf_list = []
                    meaning_list = []
                    meaning_conf_list = []
                    for t in texts:
                        r, rc, _ = restore_text(t)
                        restored_list.append(r)
                        rest_conf_list.append(rc)
                        m, mc = predict_meaning(r)
                        meaning_list.append(m)
                        meaning_conf_list.append(mc)
                    batch_df['Detected_Language'] = lang_preds
                    batch_df['Language_Confidence'] = lang_confs
                    batch_df['Restored_Text'] = restored_list
                    batch_df['Restoration_Confidence'] = rest_conf_list
                    batch_df['English_Meaning'] = meaning_list
                    batch_df['Meaning_Confidence'] = meaning_conf_list
                    # Only a preview is rendered; the download contains every row.
                    st.dataframe(batch_df.head(300))
                    # download: prefer xlsx, fall back to CSV when the Excel
                    # export fails (e.g. the openpyxl engine is unavailable).
                    towrite = io.BytesIO()
                    try:
                        batch_df.to_excel(towrite, index=False, engine='openpyxl')
                    except Exception:
                        st.download_button("⬇️ Download CSV", data=batch_df.to_csv(index=False).encode('utf-8'),
                                           file_name="batch_results.csv")
                    else:
                        towrite.seek(0)
                        st.download_button("⬇️ Download results (xlsx)", data=towrite,
                                           file_name="batch_results.xlsx",
                                           mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
# Tokenizer Tab
# Shows the tokens and token ids the loaded BERT tokenizer produces for the
# entered text; reports an error when no tokenizer is loaded.
with tabs[5]:
    st.header("🧩 Tokenizer / Inspect")
    txt_token = st.text_area(
        "Enter text to visualize tokenizer tokens",
        height=140,
        key="token_text",
    )
    show_clicked = st.button("Show tokens")
    if show_clicked and bert_tokenizer is None:
        st.error("BERT tokenizer not loaded. Load BERT via sidebar.")
    elif show_clicked:
        tokens = bert_tokenizer.tokenize(txt_token)
        st.write(tokens)
        ids = bert_tokenizer.encode(txt_token)
        st.write("Token IDs:", ids)
# About
# Static description of the app's components plus usage notes.
with tabs[6]:
    st.header("ℹ️ About")
    # The text lines stay at column 0 so the rendered markdown is unchanged.
    about_overview = """
**Philological AI Ensemble**
- Language detection: BERT CLS embeddings -> MLP classifier
- Restoration: Seq2Seq encoder-decoder LSTM (dataset exact-match fallback)
- Meaning interpreter: CountVectorizer -> MLP classifier
- Features: batch processing, model saving/loading, screenshots, screen recording, print.
"""
    about_notes = "**Notes:** Keep `Philological_7525.xlsx` in the repo root for automatic load. Use the sidebar to upload/replace dataset or to trigger training of components."
    st.markdown(about_overview)
    st.markdown(about_notes)
# -------------------------
# Extra: Train seq2seq via UI (safe)
# -------------------------
# Sidebar utilities, shown only when the checkbox is ticked:
#   * Seq2Seq training on the first `max_rows` rows of the dataset, followed
#     by a reload of the saved model and tokenizers.
#   * Meaning-interpreter training on the full dataset.
st.sidebar.markdown("---")
st.sidebar.header("Training Controls")
if st.sidebar.checkbox("Show training utilities", value=False):
    st.sidebar.subheader("Seq2Seq training")
    # An empty dataset is treated like a missing one: there is nothing to train on.
    if df is None or len(df) == 0:
        st.sidebar.info("Upload dataset to enable seq2seq training.")
    else:
        n_rows = len(df)
        # Clamp the lower bound to the dataset size: with fewer than 100 rows
        # a fixed min_value=100 would exceed max_value and make the widget raise.
        max_rows = st.sidebar.number_input(
            "Max rows to use for seq2seq (balanced)",
            min_value=min(100, n_rows),
            max_value=n_rows,
            value=min(2000, n_rows),
            step=100,
        )
        seq_epochs = st.sidebar.number_input("Seq2Seq epochs", min_value=1, max_value=100, value=5)
        seq_batch = st.sidebar.number_input("Seq2Seq batch size", min_value=8, max_value=512, value=64)
        if st.sidebar.button("Start Seq2Seq training"):
            with st.spinner("Training seq2seq (this may take long)..."):
                try:
                    # Column access sits inside the try so a dataset without
                    # these columns is reported in the sidebar instead of
                    # crashing the app.
                    corrupted = df['corrupted_text'].astype(str).tolist()[:max_rows]
                    restored = df['restored_text'].astype(str).tolist()[:max_rows]
                    model, in_tok, out_tok, history = train_seq2seq(corrupted, restored,
                                                                    epochs=seq_epochs, batch_size=seq_batch,
                                                                    save_model=True)
                except Exception as e:
                    st.sidebar.error(f"Seq2Seq training failed: {e}")
                else:
                    st.sidebar.success("Seq2Seq training finished and saved.")
                    # reload the freshly saved artifacts so they are used right away
                    try:
                        seq2seq_model = load_model(SEQ2SEQ_MODEL_PATH)
                        # NOTE: pickle.load can execute arbitrary code; only
                        # load tokenizer files written by this app itself.
                        with open(SEQ2SEQ_TOKENIZERS_PATH, "rb") as f:
                            tokdata = pickle.load(f)
                        seq2seq_in_tok = tokdata.get('in_tok')
                        seq2seq_out_tok = tokdata.get('out_tok')
                        seq2seq_params = tokdata.get('params', {})
                        seq2seq_available = True
                    except Exception as e:
                        seq2seq_available = False
                        # Surface the failure instead of silently disabling the model.
                        st.sidebar.warning(f"Seq2Seq model was saved but could not be reloaded: {e}")
    st.sidebar.subheader("Language interpreter training")
    if st.sidebar.button("Train meaning interpreter on full dataset"):
        if df is None:
            st.sidebar.error("Upload dataset first.")
        else:
            with st.spinner("Training meaning interpreter..."):
                try:
                    vec_mean, meaning_clf, meaning_le, meaning_acc = train_meaning_interpreter(df)
                    st.sidebar.success(f"Meaning interpreter trained (acc: {meaning_acc:.2%})")
                except Exception as e:
                    st.sidebar.error(f"Meaning training failed: {e}")
# End of app