|
|
|
|
|
""" |
|
|
|
|
|
Uses unified pipeline for both text and document processing |
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
from datetime import datetime |
|
|
import os |
|
|
from typing import List, Dict, Any, Tuple, Optional |
|
|
import tempfile |
|
|
import logging |
|
|
import sys |
|
|
import subprocess |
|
|
import platform |
|
|
|
|
|
from unified_pipeline import process_text_content, process_document_content |
|
|
from scraper_common import scrape_news_async, set_scraping_cancelled, force_close_browser, scraping_cancelled |
|
|
from auth import auth_manager |
|
|
|
|
|
import os, glob, subprocess, pathlib |
|
|
|
|
|
|
|
|
import os, glob, subprocess |
|
|
|
|
|
|
|
|
import os |
|
|
import subprocess |
|
|
|
|
|
|
|
|
# Pin the Playwright browser cache to a fixed location so the install check
# below and the runtime browser launch agree on where Chromium lives.
os.environ["PLAYWRIGHT_BROWSERS_PATH"] = "/root/.cache/ms-playwright"


def ensure_chromium():
    """Install the Playwright Chromium browser if it is not already present.

    Runs ``playwright install --with-deps chromium`` as a subprocess. The
    install is skipped when a chromium build is already found under
    PLAYWRIGHT_BROWSERS_PATH, so repeated app restarts do not pay the
    download cost again. Failures are printed and swallowed: the app should
    still start, and scraping will surface its own error later if the
    browser is genuinely unusable.

    Returns:
        None
    """
    browsers_path = os.environ.get("PLAYWRIGHT_BROWSERS_PATH", "")
    # Fast path: any existing chromium-* folder means the browser is already
    # installed; re-running the installer every boot is slow and redundant.
    if browsers_path and glob.glob(os.path.join(browsers_path, "chromium-*")):
        return
    try:
        subprocess.run(
            ["playwright", "install", "--with-deps", "chromium"],
            check=True
        )
    except Exception as e:
        # Best-effort: keep startup alive even if the install fails
        # (no network, missing CLI, read-only filesystem, ...).
        # NOTE: logger is not configured yet at this point in the module,
        # so plain print is used deliberately.
        print("Playwright install failed:", e)


ensure_chromium()
|
|
|
|
|
# Configure root logging once at import time: DEBUG level with a single
# stdout handler so log lines show up in container/console output.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Cooperative cancellation flag for the document-processing tab; polled by
# the worker loop and reset to False by clear_memory_state().
document_processing_cancelled = False

# Single-operator session state for the Gradio app.
# NOTE(review): these are module-level globals, so every browser session
# shares one login — confirm a single-user deployment is intended.
current_user = None
current_session = None
|
|
|
|
|
def clear_memory_state():
    """Reset all cancellation flags, counters and cached resources.

    Call before kicking off a new scraping/processing run so state left
    over from a previous (possibly cancelled) run cannot leak into it.
    """
    global document_processing_cancelled

    logger.info("🧹 Clearing memory state...")

    # Reset cooperative cancellation flags for both tabs.
    document_processing_cancelled = False
    set_scraping_cancelled(False)

    # Reset the shared PDF download counter.
    from scraper_common import reset_global_pdf_count
    reset_global_pdf_count()

    # Forget URLs that previously timed out.
    from scraper_common import TIMEOUT_URLS
    TIMEOUT_URLS.clear()

    # Close any lingering Playwright browser on a background thread so this
    # call never blocks the UI; errors here are expected when no browser is
    # open and are only debug-logged.
    try:
        import asyncio
        import threading

        def _shutdown_browser():
            try:
                event_loop = asyncio.new_event_loop()
                asyncio.set_event_loop(event_loop)
                event_loop.run_until_complete(force_close_browser())
                event_loop.close()
            except Exception as e:
                logger.debug(f"Browser already closed or error closing: {e}")

        threading.Thread(target=_shutdown_browser).start()
    except Exception as e:
        logger.debug(f"Error closing browser during memory clear: {e}")

    # Zero out pipeline statistics if a pipeline instance exists.
    try:
        from unified_pipeline import get_pipeline
        pipeline = get_pipeline()
        if pipeline:
            pipeline.reset_stats()
            logger.debug("Pipeline statistics reset")
    except Exception as e:
        logger.debug(f"Error resetting pipeline stats: {e}")

    # Encourage prompt release of large scraped payloads.
    import gc
    gc.collect()

    logger.info("✅ Memory state cleared successfully")
|
|
|
|
|
|
|
|
def login_user(username: str, password: str) -> Tuple[bool, str]:
    """Authenticate *username* and record the session in the module globals.

    Returns:
        A ``(success, message)`` pair suitable for direct UI display.
    """
    global current_user, current_session

    ok, session_token = auth_manager.authenticate_user(username, password)
    if not ok:
        return False, "Invalid username or password"

    current_user = username
    current_session = session_token
    return True, f"Welcome, {username}!"
|
|
|
|
|
def logout_user() -> str:
    """End the active session (if any) and clear the login globals.

    Returns:
        A status message for the UI.
    """
    global current_user, current_session

    if current_session:
        auth_manager.logout_user(current_session)

    current_session = None
    current_user = None
    return "Logged out successfully"
|
|
|
|
|
def is_authenticated() -> bool:
    """Return True when a user is logged in with a still-valid session.

    Side effect: an expired or revoked session clears the login globals so
    stale credentials are not re-checked on every call.
    """
    global current_user, current_session

    # No local login at all — nothing to validate.
    if not (current_user and current_session):
        return False

    valid, username = auth_manager.validate_session(current_session)
    if valid:
        return True

    # Session was rejected server-side: drop the local state.
    current_user = None
    current_session = None
    return False
|
|
|
|
|
def get_current_user() -> Optional[str]:
    """Return the username of the currently authenticated user, or None."""
    return current_user if is_authenticated() else None
|
|
|
|
|
def require_auth(func):
    """Decorator that gates *func* behind an active login.

    When no user is authenticated, the wrapped call short-circuits and
    returns ``(None, message)`` instead of invoking *func*.

    Args:
        func: The callable to protect.

    Returns:
        The wrapped callable, with *func*'s metadata preserved.
    """
    from functools import wraps

    # functools.wraps keeps func's __name__/__doc__ on the wrapper; the
    # original plain wrapper masked them, which breaks introspection and
    # makes log/tracebacks harder to read.
    @wraps(func)
    def wrapper(*args, **kwargs):
        if not is_authenticated():
            return None, "Please login to access this feature"
        return func(*args, **kwargs)
    return wrapper
|
|
|
|
|
|
|
|
def ensure_archive_directory():
    """Ensure the top-level archive directory exists and return its path.

    Uses ``os.makedirs(..., exist_ok=True)`` so two concurrent callers
    cannot race between the existence check and the creation (the original
    ``exists`` + ``makedirs`` pair could raise FileExistsError under
    concurrency).

    Returns:
        The relative archive directory path ("archive").
    """
    archive_dir = "archive"
    if not os.path.isdir(archive_dir):
        os.makedirs(archive_dir, exist_ok=True)
        logger.info(f"📁 Created archive directory: {archive_dir}")
    return archive_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_csv_download(df: pd.DataFrame, filename_prefix: str = "data") -> str:
    """Write *df* to a temporary CSV file and return the file path.

    An empty DataFrame still produces a valid CSV: its own columns are used
    when it has any, otherwise a default header row is written, so the
    download is never a zero-byte file.

    Args:
        df: Data to export.
        filename_prefix: Kept for interface compatibility; the temp file
            name itself is generated by the OS.

    Returns:
        Path of the temporary ``.csv`` file (caller owns cleanup).
    """
    default_columns = ['#', 'title', 'content', 'summary', 'summary_somali', 'date', 'url']
    if df.empty:
        # Preserve the frame's real header when it has one. The original
        # code re-tested df.empty inside this branch (always True here), so
        # the defaults always won and a column-only frame lost its header.
        columns = df.columns if len(df.columns) > 0 else default_columns
        csv_content = pd.DataFrame(columns=columns).to_csv(index=False)
    else:
        csv_content = df.to_csv(index=False)

    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8') as f:
        f.write(csv_content)
        temp_path = f.name

    return temp_path
|
|
|
|
|
|
|
|
def save_csv_to_archive(df: pd.DataFrame, source: str, filename_prefix: str = "data") -> str:
    """Save *df* as CSV under ``archive/<source>/<YYYY-MM-DD>/`` and return its path.

    The filename carries a timestamp so repeated runs on the same day never
    overwrite each other. An empty DataFrame is written with its own columns
    when present, otherwise with a default header row.

    Args:
        df: Data to archive.
        source: Site/domain the data came from; becomes a directory name.
        filename_prefix: Leading part of the generated filename.

    Returns:
        Full path of the written CSV file.
    """
    today = datetime.now().strftime("%Y-%m-%d")
    archive_dir = os.path.join("archive", source, today)
    os.makedirs(archive_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = os.path.join(archive_dir, f"{filename_prefix}_{timestamp}.csv")

    if df.empty:
        default_columns = ['#', 'title', 'content', 'summary', 'summary_somali', 'date', 'url']
        # Keep the frame's real header if it has one (the original re-tested
        # df.empty inside this branch, so the defaults always won).
        columns = df.columns if len(df.columns) > 0 else default_columns
        pd.DataFrame(columns=columns).to_csv(csv_path, index=False)
    else:
        df.to_csv(csv_path, index=False)

    return csv_path
|
|
|
|
|
|
|
|
def create_text_content_tab():
    """
    Create the text content tab interface.

    Builds the scraper configuration inputs, results table and download
    button, and wires up the scrape/cancel/clear event handlers.
    """
    with gr.Tab("Text Content"):
        gr.Markdown("## Website Content Scraper")
        gr.Markdown("Extract and analyze content from websites with AI-powered summarization.")

        with gr.Group():
            gr.Markdown("### Configuration")
            with gr.Row():
                url_input = gr.Textbox(
                    label="Website URL",
                    placeholder="https://example.com/article",
                    interactive=True,
                    scale=2
                )
                keywords_input = gr.Textbox(
                    label="Filter Keywords (optional)",
                    placeholder="e.g., flood, drought, conflict (comma-separated)",
                    interactive=True,
                    scale=2
                )

            with gr.Row():
                start_date_input = gr.Textbox(
                    label="Start Date (optional)",
                    placeholder="YYYY-MM-DD (e.g., 2024-01-01)",
                    interactive=True,
                    scale=1,
                    info="Filter articles from this date onwards"
                )
                end_date_input = gr.Textbox(
                    label="End Date (optional)",
                    placeholder="YYYY-MM-DD (e.g., 2024-12-31)",
                    interactive=True,
                    scale=1,
                    info="Filter articles up to this date"
                )

            with gr.Row():
                scrape_btn = gr.Button("Scrape Content", variant="primary")
                cancel_btn = gr.Button("Cancel", variant="stop", interactive=True, value="Cancel")
                clear_btn = gr.Button("Clear", variant="secondary")

        status_text = gr.Textbox(
            label="Status",
            value="Ready to scrape content...",
            interactive=False,
            visible=True
        )

        content_df = gr.Dataframe(
            label="Scraped Content",
            headers=["#", "Title", "Category", "Content", "Summary", "Summary (Somali)", "Date", "URL"],
            datatype=["str", "str", "str", "str", "str", "str", "str", "str"],
            interactive=True,
            wrap=True
        )

        with gr.Row():
            download_btn = gr.DownloadButton(
                label="📥 Download CSV",
                variant="secondary",
                visible=False
            )

        # Holds the full (untruncated) article payloads alongside the table.
        full_content_store = gr.State([])

        def process_and_display(url, custom_keywords="", start_date="", end_date=""):
            """Process URL and display results with progress updates.

            Generator handler: every terminal outcome is *yielded* before
            returning. The original used ``return <values>`` inside this
            generator; a generator's return value only populates
            StopIteration.value, which Gradio discards, so the error and
            cancellation messages were never shown in the UI.
            """
            clear_memory_state()

            from scraper_common import clear_captcha_status
            clear_captcha_status()

            logger.info(f"🚀 Starting text content processing for URL: {url}")
            logger.info(f"🔑 Custom keywords provided: {custom_keywords}")
            logger.debug(f"📋 Processing parameters: URL={url.strip()}")

            if not url.strip():
                logger.warning("⚠️ Empty URL provided")
                yield pd.DataFrame(), None, "❌ Error: Please enter a valid URL", []
                return

            try:
                import asyncio
                import threading
                import time

                from unified_pipeline import determine_website_type
                website_type = determine_website_type(url.strip())

                if scraping_cancelled():
                    logger.warning("⚠️ Operation cancelled before starting")
                    yield pd.DataFrame(), None, "🛑 Operation cancelled by user", []
                    return

                status_msg = f"📡 Step 1/4: Starting content extraction from {website_type}..."
                yield pd.DataFrame(), None, status_msg, []

                if scraping_cancelled():
                    logger.warning("⚠️ Operation cancelled by user before content extraction")
                    yield pd.DataFrame(), None, "🛑 Operation cancelled by user", []
                    return

                # Shared mailbox between this generator and the worker thread.
                result_container = {
                    'df': None,
                    'full_content_data': None,
                    'error': None,
                    'completed': False,
                    'status': 'processing'
                }

                def run_async_processing():
                    """Run the async processing in a separate thread"""
                    try:
                        result_container['status'] = 'scraping'
                        loop = asyncio.new_event_loop()
                        asyncio.set_event_loop(loop)
                        df, full_content_data = loop.run_until_complete(process_text_content(url.strip(), custom_keywords, start_date.strip() if start_date else None, end_date.strip() if end_date else None))
                        result_container['df'] = df
                        result_container['full_content_data'] = full_content_data
                        result_container['status'] = 'completed'
                        result_container['completed'] = True
                    except Exception as e:
                        result_container['error'] = str(e)
                        result_container['status'] = 'error'
                        result_container['completed'] = True
                    finally:
                        loop.close()

                processing_thread = threading.Thread(target=run_async_processing)
                processing_thread.start()

                status_step = 1
                last_status_time = time.time()

                # Poll the worker: surface CAPTCHA prompts, advance the step
                # messages every ~2s, and honour cancellation requests.
                while processing_thread.is_alive():
                    if scraping_cancelled():
                        logger.warning("⚠️ Operation cancelled during processing")
                        try:
                            loop = asyncio.new_event_loop()
                            asyncio.set_event_loop(loop)
                            loop.run_until_complete(force_close_browser())
                            loop.close()
                        except Exception as e:
                            logger.error(f"Error closing browser: {e}")
                        yield pd.DataFrame(), None, "🛑 Operation cancelled by user", []
                        return

                    from scraper_common import get_captcha_status
                    captcha_status = get_captcha_status()
                    if captcha_status:
                        yield pd.DataFrame(), None, captcha_status, []
                        time.sleep(0.5)
                        continue

                    current_time = time.time()
                    if current_time - last_status_time >= 2.0:
                        if status_step == 1:
                            yield pd.DataFrame(), None, "🔄 Step 2/4: Extracting content from website...", []
                            status_step = 2
                            last_status_time = current_time
                        elif status_step == 2:
                            yield pd.DataFrame(), None, "🤖 Step 3/4: Processing content with AI models...", []
                            status_step = 3
                            last_status_time = current_time

                    time.sleep(0.5)

                if result_container['error']:
                    logger.error(f"❌ Error during processing: {result_container['error']}")
                    yield pd.DataFrame(), None, f"❌ Error: {result_container['error']}", []
                    return

                df = result_container['df']
                full_content_data = result_container['full_content_data']

                if scraping_cancelled():
                    logger.warning("⚠️ Operation cancelled by user after content extraction")
                    yield pd.DataFrame(), None, "🛑 Operation cancelled by user", []
                    return

                num_articles = len(df) if df is not None and not df.empty else 0
                yield pd.DataFrame(), None, f"💾 Step 4/4: Saving to archive... Found {num_articles} articles", []

                if scraping_cancelled():
                    logger.warning("⚠️ Operation cancelled by user during archiving")
                    yield pd.DataFrame(), None, "🛑 Operation cancelled by user", []
                    return

                if not df.empty:
                    try:
                        source = url.split('/')[2].replace('www.', '') if '://' in url else 'unknown'
                        archive_path = save_csv_to_archive(df, source, "scraped_content")
                        logger.info(f"📁 Saved to archive: {archive_path}")
                    except Exception as e:
                        # Archive failure is non-fatal: the download file below
                        # still gives the user their data.
                        logger.error(f"❌ Error saving to archive: {str(e)}")

                csv_file = create_csv_download(df, "scraped_content") if not df.empty else None

                if scraping_cancelled():
                    logger.warning("⚠️ Operation cancelled by user before finalizing results")
                    yield pd.DataFrame(), None, "🛑 Operation cancelled by user", []
                    return

                logger.info(f"✅ Processing complete! Found {len(df)} articles.")
                yield df, csv_file, f"✅ Processing complete! Found {len(df)} articles.", full_content_data

            except Exception as e:
                logger.error(f"❌ Error during text content processing: {str(e)}")
                logger.debug(f"🔍 Error details: {type(e).__name__}: {str(e)}")
                yield pd.DataFrame(), None, f"Error: {str(e)}", []

        def cancel_scraping():
            """Cancel the scraping operation"""
            logger.warning("⚠️ User requested cancellation of scraping operation")

            clear_memory_state()
            # clear_memory_state() resets the flag to False, so the cancel
            # request must be raised *after* it. The original never set the
            # flag at all, so the worker poll loop could not observe the
            # cancellation.
            set_scraping_cancelled(True)
            logger.info("🛑 Set cancellation flags")

            import threading

            def close_browser_async():
                import asyncio
                try:
                    logger.info("🔧 Attempting to close browser...")
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
                    loop.run_until_complete(force_close_browser())
                    loop.close()
                    logger.info("✅ Browser closed successfully")
                except Exception as e:
                    logger.error(f"❌ Error closing browser: {e}")

            threading.Thread(target=close_browser_async).start()

            return "🛑 Cancellation requested - stopping operation..."

        def clear_all():
            """Clear URL input, keywords input, date inputs, DataFrame, and download button"""
            logger.info("🧹 User requested to clear all data")
            clear_memory_state()
            return "", "", "", "", pd.DataFrame(), None, "Ready to scrape content...", []

        def update_download_visibility(df):
            # Show the download button only when there is data to export.
            return gr.DownloadButton(visible=not df.empty)

        scrape_btn.click(
            fn=process_and_display,
            inputs=[url_input, keywords_input, start_date_input, end_date_input],
            outputs=[content_df, download_btn, status_text, full_content_store],
            show_progress=True
        )

        cancel_btn.click(
            fn=cancel_scraping,
            outputs=[status_text]
        )

        clear_btn.click(
            fn=clear_all,
            outputs=[url_input, keywords_input, start_date_input, end_date_input, content_df, download_btn, status_text, full_content_store]
        )

        content_df.change(
            fn=update_download_visibility,
            inputs=[content_df],
            outputs=[download_btn]
        )
|
|
|
|
|
|
|
|
def create_document_content_tab():
    """
    Create the document content tab interface.

    Builds the document-source inputs, results table and download button,
    and wires up the process/cancel/clear event handlers.
    """
    with gr.Tab("Document Content"):
        gr.Markdown("## Document Content Processor")
        gr.Markdown("Extract and analyze content from PDF, DOC, and CSV documents with AI-powered processing.")

        with gr.Group():
            gr.Markdown("### Document Source")
            with gr.Row():
                doc_url_input = gr.Textbox(
                    label="Document URL",
                    placeholder="https://example.com/documents/",
                    interactive=True,
                    scale=2
                )

            with gr.Row():
                doc_start_date_input = gr.Textbox(
                    label="Start Date (optional)",
                    placeholder="YYYY-MM-DD (e.g., 2024-01-01)",
                    interactive=True,
                    scale=1,
                    info="Filter documents from this date onwards"
                )
                doc_end_date_input = gr.Textbox(
                    label="End Date (optional)",
                    placeholder="YYYY-MM-DD (e.g., 2024-12-31)",
                    interactive=True,
                    scale=1,
                    info="Filter documents up to this date"
                )

            with gr.Row():
                process_btn = gr.Button("Process Documents", variant="primary")
                doc_cancel_btn = gr.Button("Cancel", variant="stop", interactive=True, value="Cancel")
                doc_clear_btn = gr.Button("Clear", variant="secondary")

        doc_status_text = gr.Textbox(
            label="Status",
            value="Ready to process documents...",
            interactive=False,
            visible=True
        )

        doc_df = gr.Dataframe(
            label="Document Content",
            headers=["Title", "Date", "Source", "File Path", "Extracted Text", "Summary", "Summary (Somali)", "File Type"],
            datatype=["str", "str", "str", "str", "str", "str", "str", "str"],
            interactive=True,
            wrap=True
        )

        with gr.Row():
            doc_download_btn = gr.DownloadButton(
                label="📥 Download CSV",
                variant="secondary",
                visible=False
            )

        def process_and_display_docs(url, start_date="", end_date=""):
            """Process documents and display results with progress updates.

            Generator handler: terminal outcomes are *yielded* before
            returning. The original used ``return <values>`` inside this
            generator; a generator's return value only populates
            StopIteration.value, which Gradio discards, so error and
            cancellation messages never reached the UI.
            """
            clear_memory_state()

            from scraper_common import clear_captcha_status
            clear_captcha_status()

            if not url.strip():
                yield pd.DataFrame(), None, "❌ Error: Please enter a valid URL"
                return

            try:
                import asyncio
                import threading
                import time

                from unified_pipeline import determine_website_type
                website_type = determine_website_type(url.strip())

                if document_processing_cancelled:
                    yield pd.DataFrame(), None, "🛑 Operation cancelled by user"
                    return

                yield pd.DataFrame(), None, f"📄 Step 1/4: Starting document extraction from {website_type}..."

                if document_processing_cancelled:
                    yield pd.DataFrame(), None, "🛑 Operation cancelled by user"
                    return

                # Shared mailbox between this generator and the worker thread.
                result_container = {
                    'df': None,
                    'error': None,
                    'completed': False,
                    'status': 'processing'
                }

                def run_async_processing():
                    """Run the async processing in a separate thread"""
                    try:
                        result_container['status'] = 'extracting'
                        loop = asyncio.new_event_loop()
                        asyncio.set_event_loop(loop)
                        df = loop.run_until_complete(process_document_content(url.strip(), start_date.strip() if start_date else None, end_date.strip() if end_date else None))
                        result_container['df'] = df
                        result_container['status'] = 'completed'
                        result_container['completed'] = True
                    except Exception as e:
                        result_container['error'] = str(e)
                        result_container['status'] = 'error'
                        result_container['completed'] = True
                    finally:
                        loop.close()

                processing_thread = threading.Thread(target=run_async_processing)
                processing_thread.start()

                status_step = 1
                last_status_time = time.time()

                # Poll the worker: surface CAPTCHA prompts, advance the step
                # messages every ~2s, and honour cancellation requests.
                while processing_thread.is_alive():
                    if document_processing_cancelled:
                        logger.warning("⚠️ Document processing cancelled during processing")
                        try:
                            loop = asyncio.new_event_loop()
                            asyncio.set_event_loop(loop)
                            loop.run_until_complete(force_close_browser())
                            loop.close()
                        except Exception as e:
                            logger.error(f"Error closing browser: {e}")
                        yield pd.DataFrame(), None, "🛑 Operation cancelled by user"
                        return

                    from scraper_common import get_captcha_status
                    captcha_status = get_captcha_status()
                    if captcha_status:
                        yield pd.DataFrame(), None, captcha_status
                        time.sleep(0.5)
                        continue

                    current_time = time.time()
                    if current_time - last_status_time >= 2.0:
                        if status_step == 1:
                            yield pd.DataFrame(), None, "🔄 Step 2/4: Extracting documents from website..."
                            status_step = 2
                            last_status_time = current_time
                        elif status_step == 2:
                            yield pd.DataFrame(), None, "🤖 Step 3/4: Processing documents with AI models..."
                            status_step = 3
                            last_status_time = current_time

                    time.sleep(0.5)

                if result_container['error']:
                    logger.error(f"❌ Error during document processing: {result_container['error']}")
                    yield pd.DataFrame(), None, f"❌ Error: {result_container['error']}"
                    return

                df = result_container['df']

                if document_processing_cancelled:
                    yield pd.DataFrame(), None, "🛑 Operation cancelled by user"
                    return

                num_docs = len(df) if df is not None and not df.empty else 0
                yield pd.DataFrame(), None, f"💾 Step 4/4: Saving to archive... Found {num_docs} documents"

                if document_processing_cancelled:
                    yield pd.DataFrame(), None, "🛑 Operation cancelled by user"
                    return

                if not df.empty:
                    try:
                        source = url.split('/')[2].replace('www.', '') if '://' in url else 'unknown'
                        archive_path = save_csv_to_archive(df, source, "document_content")
                        logger.info(f"📁 Saved to archive: {archive_path}")
                    except Exception as e:
                        # Archive failure is non-fatal: the download file below
                        # still gives the user their data.
                        logger.error(f"❌ Error saving to archive: {str(e)}")

                csv_file = create_csv_download(df, "document_content") if not df.empty else None

                if document_processing_cancelled:
                    yield pd.DataFrame(), None, "🛑 Operation cancelled by user"
                    return

                logger.info(f"✅ Document processing complete! Found {len(df)} documents.")
                yield df, csv_file, f"✅ Processing complete! Found {len(df)} documents."

            except Exception as e:
                # Log before surfacing, consistent with the text-content tab
                # (the original swallowed the traceback silently here).
                logger.error(f"❌ Error during document processing: {str(e)}")
                yield pd.DataFrame(), None, f"Error: {str(e)}"

        def cancel_document_processing():
            """Cancel the document processing operation"""
            global document_processing_cancelled
            logger.warning("⚠️ User requested cancellation of document processing")

            clear_memory_state()
            # clear_memory_state() resets the flag to False, so the cancel
            # request must be raised *after* it. The original never set the
            # flag to True anywhere, so the poll loop in
            # process_and_display_docs could never observe a cancellation.
            document_processing_cancelled = True

            import threading

            def close_browser_async():
                import asyncio
                try:
                    logger.info("🔧 Attempting to close browser...")
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
                    loop.run_until_complete(force_close_browser())
                    loop.close()
                    logger.info("✅ Browser closed successfully")
                except Exception as e:
                    logger.error(f"❌ Error closing browser: {e}")

            threading.Thread(target=close_browser_async).start()

            return "🛑 Document processing cancelled - stopping operation..."

        def clear_doc_all():
            """Clear URL input, date inputs, DataFrame, and download button for document content"""
            clear_memory_state()
            return "", "", "", pd.DataFrame(), None, "Ready to process documents..."

        process_btn.click(
            fn=process_and_display_docs,
            inputs=[doc_url_input, doc_start_date_input, doc_end_date_input],
            outputs=[doc_df, doc_download_btn, doc_status_text],
            show_progress=True
        )

        doc_cancel_btn.click(
            fn=cancel_document_processing,
            outputs=[doc_status_text]
        )

        doc_clear_btn.click(
            fn=clear_doc_all,
            outputs=[doc_url_input, doc_start_date_input, doc_end_date_input, doc_df, doc_download_btn, doc_status_text]
        )

        doc_df.change(
            fn=lambda df: gr.DownloadButton(visible=not df.empty),
            inputs=[doc_df],
            outputs=[doc_download_btn]
        )
|
|
|
|
|
|
|
|
def create_archive_tab(): |
|
|
""" |
|
|
Create the archive access tab interface |
|
|
""" |
|
|
with gr.Tab("Archive Access"): |
|
|
gr.Markdown("## Archived Files Access") |
|
|
gr.Markdown("Browse, download, and manage previously processed files from the archive.") |
|
|
|
|
|
|
|
|
|
|
|
with gr.Row(): |
|
|
|
|
|
with gr.Column(scale=1, elem_classes="admin-section"): |
|
|
gr.Markdown("### CSV Files") |
|
|
gr.Markdown("*Processed data files*") |
|
|
|
|
|
with gr.Row(): |
|
|
refresh_csv_btn = gr.Button("Refresh CSV", variant="secondary", size="sm") |
|
|
gr.Markdown("*Update CSV file list*") |
|
|
|
|
|
csv_df = gr.Dataframe( |
|
|
label="", |
|
|
headers=["Source", "Date", "Filename", "Path"], |
|
|
datatype=["str", "str", "str", "str"], |
|
|
interactive=True, |
|
|
wrap=False, |
|
|
elem_id="csv_dataframe" |
|
|
) |
|
|
|
|
|
csv_selection = gr.Dropdown( |
|
|
label="Select CSV File", |
|
|
choices=[], |
|
|
value=None, |
|
|
interactive=True |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Row(): |
|
|
open_csv_btn = gr.Button("Open", variant="secondary", size="sm") |
|
|
delete_csv_btn = gr.Button("Delete", variant="stop", size="sm") |
|
|
open_csv_folder_btn = gr.Button("Folder", variant="secondary", size="sm") |
|
|
|
|
|
|
|
|
with gr.Column(scale=1, elem_classes="admin-section"): |
|
|
gr.Markdown("### PDF Files") |
|
|
gr.Markdown("*Downloaded documents*") |
|
|
|
|
|
with gr.Row(): |
|
|
refresh_pdf_btn = gr.Button("Refresh PDF", variant="secondary", size="sm") |
|
|
gr.Markdown("*Update PDF file list*") |
|
|
|
|
|
pdf_df = gr.Dataframe( |
|
|
label="", |
|
|
headers=["Source", "Date", "Filename", "Size", "Path"], |
|
|
datatype=["str", "str", "str", "str", "str"], |
|
|
interactive=True, |
|
|
wrap=False, |
|
|
elem_id="pdf_dataframe" |
|
|
) |
|
|
|
|
|
pdf_selection = gr.Dropdown( |
|
|
label="Select PDF File", |
|
|
choices=[], |
|
|
value=None, |
|
|
interactive=True |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Row(): |
|
|
open_pdf_btn = gr.Button("Open", variant="secondary", size="sm") |
|
|
delete_pdf_btn = gr.Button("Delete", variant="stop", size="sm") |
|
|
open_pdf_folder_btn = gr.Button("Folder", variant="secondary", size="sm") |
|
|
|
|
|
|
|
|
|
|
|
with gr.Column(elem_classes="admin-section"): |
|
|
status_text = gr.Textbox( |
|
|
label="Status", |
|
|
interactive=False, |
|
|
value="Ready to access archived files...", |
|
|
lines=2 |
|
|
) |
|
|
|
|
|
|
|
|
def get_archived_csv_files(): |
|
|
"""Get list of archived CSV files""" |
|
|
archive_dir = ensure_archive_directory() |
|
|
csv_files = [] |
|
|
|
|
|
if os.path.exists(archive_dir): |
|
|
for source in os.listdir(archive_dir): |
|
|
source_path = os.path.join(archive_dir, source) |
|
|
if os.path.isdir(source_path): |
|
|
for date in os.listdir(source_path): |
|
|
date_path = os.path.join(source_path, date) |
|
|
if os.path.isdir(date_path): |
|
|
for file in os.listdir(date_path): |
|
|
if file.endswith('.csv'): |
|
|
file_path = os.path.join(date_path, file) |
|
|
file_size = os.path.getsize(file_path) |
|
|
csv_files.append({ |
|
|
'source': source, |
|
|
'date': date, |
|
|
'filename': file, |
|
|
'path': file_path, |
|
|
'size': f"{file_size / 1024:.2f} KB" |
|
|
}) |
|
|
|
|
|
return sorted(csv_files, key=lambda x: (x['source'], x['date'], x['filename']), reverse=True) |
|
|
|
|
|
def get_archived_pdf_files(): |
|
|
"""Get list of archived PDF files""" |
|
|
archive_dir = ensure_archive_directory() |
|
|
pdf_files = [] |
|
|
|
|
|
if os.path.exists(archive_dir): |
|
|
for source in os.listdir(archive_dir): |
|
|
source_path = os.path.join(archive_dir, source) |
|
|
if os.path.isdir(source_path): |
|
|
for date in os.listdir(source_path): |
|
|
date_path = os.path.join(source_path, date) |
|
|
if os.path.isdir(date_path): |
|
|
|
|
|
for file in os.listdir(date_path): |
|
|
if file.endswith('.pdf'): |
|
|
file_path = os.path.join(date_path, file) |
|
|
file_size = os.path.getsize(file_path) |
|
|
pdf_files.append({ |
|
|
'source': source, |
|
|
'date': date, |
|
|
'filename': file, |
|
|
'path': file_path, |
|
|
'size': f"{file_size / 1024 / 1024:.2f} MB" |
|
|
}) |
|
|
|
|
|
|
|
|
pdf_folder = os.path.join(date_path, "pdf") |
|
|
if os.path.exists(pdf_folder): |
|
|
for file in os.listdir(pdf_folder): |
|
|
if file.endswith('.pdf'): |
|
|
file_path = os.path.join(pdf_folder, file) |
|
|
file_size = os.path.getsize(file_path) |
|
|
pdf_files.append({ |
|
|
'source': source, |
|
|
'date': date, |
|
|
'filename': file, |
|
|
'path': file_path, |
|
|
'size': f"{file_size / 1024 / 1024:.2f} MB" |
|
|
}) |
|
|
|
|
|
return sorted(pdf_files, key=lambda x: (x['source'], x['date'], x['filename']), reverse=True) |
|
|
|
|
|
def refresh_csv_files(): |
|
|
"""Refresh CSV files list""" |
|
|
csv_files = get_archived_csv_files() |
|
|
if csv_files: |
|
|
display_data = [ |
|
|
{ |
|
|
'Source': item['source'], |
|
|
'Date': item['date'], |
|
|
'Filename': item['filename'], |
|
|
'Path': item['path'] |
|
|
} |
|
|
for item in csv_files |
|
|
] |
|
|
df = pd.DataFrame(display_data) |
|
|
choices = [f"{item['source']} | {item['date']} | {item['filename']}" for item in csv_files] |
|
|
default_choice = choices[0] if choices else None |
|
|
return df, f"Found {len(csv_files)} CSV files. Select a file below and click 'Open Selected CSV'.", gr.update(choices=choices, value=default_choice) |
|
|
else: |
|
|
return pd.DataFrame(), "No CSV files found in the archive.", gr.update(choices=[], value=None) |
|
|
|
|
|
def refresh_pdf_files(): |
|
|
"""Refresh PDF files list""" |
|
|
pdf_files = get_archived_pdf_files() |
|
|
if pdf_files: |
|
|
display_data = [ |
|
|
{ |
|
|
'Source': item['source'], |
|
|
'Date': item['date'], |
|
|
'Filename': item['filename'], |
|
|
'Size': item['size'], |
|
|
'Path': item['path'] |
|
|
} |
|
|
for item in pdf_files |
|
|
] |
|
|
df = pd.DataFrame(display_data) |
|
|
choices = [f"{item['source']} | {item['date']} | {item['filename']}" for item in pdf_files] |
|
|
default_choice = choices[0] if choices else None |
|
|
return df, f"Found {len(pdf_files)} PDF files. Select a file below and click 'Open Selected PDF'.", gr.update(choices=choices, value=default_choice) |
|
|
else: |
|
|
return pd.DataFrame(), "No PDF files found in the archive.", gr.update(choices=[], value=None) |
|
|
|
|
|
def open_selected_csv(selected_option): |
|
|
"""Open the selected CSV file""" |
|
|
try: |
|
|
if not selected_option: |
|
|
return "Please choose a CSV from the dropdown before clicking 'Open'." |
|
|
|
|
|
try: |
|
|
source, date, filename = [part.strip() for part in selected_option.split("|")] |
|
|
except ValueError: |
|
|
return "Invalid selection format. Please refresh the list and try again." |
|
|
|
|
|
for item in get_archived_csv_files(): |
|
|
if item['source'] == source and item['date'] == date and item['filename'] == filename: |
|
|
file_path = item['path'] |
|
|
if os.path.exists(file_path): |
|
|
return open_csv_file(file_path) |
|
|
return f"Cannot open file: {file_path}. File does not exist." |
|
|
|
|
|
return "Selected file not found. Please refresh the list." |
|
|
except Exception as e: |
|
|
return f"Error opening CSV file: {str(e)}" |
|
|
|
|
|
def open_selected_pdf(selected_option):
    """Open the archived PDF that matches the dropdown selection.

    Args:
        selected_option: "source | date | filename" string from the dropdown.

    Returns:
        A human-readable status message.
    """
    try:
        if not selected_option:
            return "Please choose a PDF from the dropdown before clicking 'Open'."

        # The dropdown value is "source | date | filename"; anything else is stale.
        parts = [piece.strip() for piece in selected_option.split("|")]
        if len(parts) != 3:
            return "Invalid selection format. Please refresh the list and try again."
        source, date, filename = parts

        for entry in get_archived_pdf_files():
            if (entry['source'], entry['date'], entry['filename']) == (source, date, filename):
                path = entry['path']
                if os.path.exists(path):
                    return open_pdf_file(path)
                return f"Cannot open file: {path}. File does not exist."

        return "Selected file not found. Please refresh the list."
    except Exception as e:
        return f"Error opening PDF file: {str(e)}"
|
|
|
|
|
def open_csv_file(file_path: str):
    """Launch the OS default application for a CSV file.

    Args:
        file_path: Path to the CSV file (relative or absolute).

    Returns:
        A status message describing success or failure.
    """
    try:
        abs_path = os.path.abspath(file_path)
        system = platform.system()
        if system == "Windows":
            # "start" is a cmd.exe built-in, hence shell=True; the empty
            # string is the window-title argument "start" expects first.
            subprocess.run(["start", "", abs_path], check=True, shell=True)
        else:
            launcher = "open" if system == "Darwin" else "xdg-open"
            subprocess.run([launcher, abs_path], check=True)
        return f"Opened CSV file: {abs_path}"
    except Exception as e:
        return f"Error opening CSV file: {str(e)}"
|
|
|
|
|
def open_pdf_file(file_path: str):
    """Launch the OS default application for a PDF file.

    Args:
        file_path: Path to the PDF file (relative or absolute).

    Returns:
        A status message describing success or failure.
    """
    try:
        abs_path = os.path.abspath(file_path)
        system = platform.system()
        if system == "Windows":
            # "start" is a cmd.exe built-in, hence shell=True; the empty
            # string is the window-title argument "start" expects first.
            subprocess.run(["start", "", abs_path], check=True, shell=True)
        else:
            launcher = "open" if system == "Darwin" else "xdg-open"
            subprocess.run([launcher, abs_path], check=True)
        return f"Opened PDF file: {abs_path}"
    except Exception as e:
        return f"Error opening PDF file: {str(e)}"
|
|
|
|
|
|
|
|
def delete_selected_csv(selected_option):
    """Delete the archived CSV file matching the dropdown selection.

    Args:
        selected_option: "source | date | filename" string from the dropdown.

    Returns:
        A human-readable status message.
    """
    try:
        if not selected_option:
            return "Please choose a CSV from the dropdown before clicking 'Delete'."

        try:
            source, date, filename = [part.strip() for part in selected_option.split("|")]
        except ValueError:
            return "Invalid selection format. Please refresh the list and try again."

        for item in get_archived_csv_files():
            if item['source'] == source and item['date'] == date and item['filename'] == filename:
                file_path = item['path']
                if os.path.exists(file_path):
                    os.remove(file_path)
                    # Fix: the f-string previously contained no placeholder, so
                    # the deleted file's name was never reported to the user.
                    return f"Successfully deleted CSV file: {filename}"
                return f"Cannot delete file: {file_path}. File does not exist."

        return "Selected file not found. Please refresh the list."
    except Exception as e:
        return f"Error deleting CSV file: {str(e)}"
|
|
|
|
|
def delete_selected_pdf(selected_option):
    """Delete the archived PDF file matching the dropdown selection.

    Args:
        selected_option: "source | date | filename" string from the dropdown.

    Returns:
        A human-readable status message.
    """
    try:
        if not selected_option:
            return "Please choose a PDF from the dropdown before clicking 'Delete'."

        try:
            source, date, filename = [part.strip() for part in selected_option.split("|")]
        except ValueError:
            return "Invalid selection format. Please refresh the list and try again."

        for item in get_archived_pdf_files():
            if item['source'] == source and item['date'] == date and item['filename'] == filename:
                file_path = item['path']
                if os.path.exists(file_path):
                    os.remove(file_path)
                    # Fix: the f-string previously contained no placeholder, so
                    # the deleted file's name was never reported to the user.
                    return f"Successfully deleted PDF file: {filename}"
                return f"Cannot delete file: {file_path}. File does not exist."

        return "Selected file not found. Please refresh the list."
    except Exception as e:
        return f"Error deleting PDF file: {str(e)}"
|
|
|
|
|
def open_csv_folder():
    """Reveal the CSV archive directory in the platform's file browser.

    Returns:
        A status message describing success or failure.
    """
    target = os.path.abspath("archive")
    try:
        system = platform.system()
        if system == "Windows":
            subprocess.run(["explorer", target], check=True)
        elif system == "Darwin":
            subprocess.run(["open", target], check=True)
        else:
            subprocess.run(["xdg-open", target], check=True)
        return f"Opened archive folder: {target}"
    except Exception as e:
        return f"Error opening folder: {str(e)}"
|
|
|
|
|
def open_pdf_folder():
    """Reveal the PDF archive directory in the platform's file browser.

    Returns:
        A status message describing success or failure.
    """
    target = os.path.abspath("archive")
    try:
        system = platform.system()
        if system == "Windows":
            subprocess.run(["explorer", target], check=True)
        elif system == "Darwin":
            subprocess.run(["open", target], check=True)
        else:
            subprocess.run(["xdg-open", target], check=True)
        return f"Opened archive folder: {target}"
    except Exception as e:
        return f"Error opening folder: {str(e)}"
|
|
|
|
|
# --- Archive tab event wiring ---------------------------------------------
# All handlers report their result into the shared `status_text` component.

# Rebuild the CSV table, status line, and file-selection dropdown.
refresh_csv_btn.click(
    fn=refresh_csv_files,
    outputs=[csv_df, status_text, csv_selection]
)

# Rebuild the PDF table, status line, and file-selection dropdown.
refresh_pdf_btn.click(
    fn=refresh_pdf_files,
    outputs=[pdf_df, status_text, pdf_selection]
)

# Open the selected CSV with the OS default application.
open_csv_btn.click(
    fn=open_selected_csv,
    inputs=[csv_selection],
    outputs=[status_text]
)

# Open the selected PDF with the OS default application.
open_pdf_btn.click(
    fn=open_selected_pdf,
    inputs=[pdf_selection],
    outputs=[status_text]
)

# Delete the selected CSV from the archive directory.
delete_csv_btn.click(
    fn=delete_selected_csv,
    inputs=[csv_selection],
    outputs=[status_text]
)

# Delete the selected PDF from the archive directory.
delete_pdf_btn.click(
    fn=delete_selected_pdf,
    inputs=[pdf_selection],
    outputs=[status_text]
)

# Open the archive folder itself in the OS file browser.
open_csv_folder_btn.click(
    fn=open_csv_folder,
    outputs=[status_text]
)

open_pdf_folder_btn.click(
    fn=open_pdf_folder,
    outputs=[status_text]
)
|
|
|
|
|
|
|
|
def create_keywords_management_tab():
    """
    Create the keywords management tab interface.

    Builds the "Keywords Management" Gradio tab: a form to add keyword
    categories, a form to edit/delete existing categories, and a shared
    status box. Category data is persisted via the `keyword_filter` module.
    """
    with gr.Tab("Keywords Management"):
        gr.Markdown("## Keywords Configuration")
        gr.Markdown("Manage keyword categories for intelligent article filtering and categorization.")

        def load_keywords_config():
            """Load the current keywords configuration; return {} on any failure."""
            try:
                from keyword_filter import load_keywords_config
                categories = load_keywords_config()
                return categories if categories else {}
            except Exception as e:
                logger.error(f"Error loading keywords config: {str(e)}")
                return {}

        def get_category_list():
            """Return category names for the dropdown (empty list if none)."""
            categories = load_keywords_config()
            return list(categories.keys()) if categories else []

        def get_keywords_for_category(category):
            """Return the category's keywords as a comma-separated string."""
            categories = load_keywords_config()
            if category and category in categories:
                return ", ".join(categories[category])
            return ""

        def add_new_category(category_name, keywords_text):
            """Add a new category with keywords.

            Returns (status message, dropdown update, keywords-box update) to
            match the three output components wired below.
            """
            try:
                from keyword_filter import load_keywords_config, save_keywords_config

                if not category_name.strip():
                    return "❌ Category name cannot be empty", gr.update(), gr.update()

                categories = load_keywords_config()
                if not categories:
                    categories = {}

                keywords = [kw.strip() for kw in keywords_text.split(",") if kw.strip()]
                if not keywords:
                    return "❌ Please provide at least one keyword", gr.update(), gr.update()

                categories[category_name.strip()] = keywords

                config_data = {"categories": categories}
                success, message = save_keywords_config(config_data)

                if success:
                    return f"✅ {message}", gr.update(choices=get_category_list(), value=category_name.strip()), gr.update()
                else:
                    return f"❌ {message}", gr.update(), gr.update()

            except Exception as e:
                logger.error(f"Error adding category: {str(e)}")
                return f"❌ Error adding category: {str(e)}", gr.update(), gr.update()

        def update_category_keywords(category, keywords_text):
            """Update keywords for a category.

            Returns a single status string: this handler is wired to exactly
            one output component (status_display).

            Bug fix: the early-exit branches previously returned a 2-tuple
            (message, gr.update()) even though only one output is wired,
            which makes Gradio raise a "too many output values" error when
            those branches fire. Every branch now returns one string.
            """
            try:
                from keyword_filter import load_keywords_config, save_keywords_config

                if not category:
                    return "❌ Please select a category"

                categories = load_keywords_config()
                if not categories:
                    return "❌ No categories found"

                keywords = [kw.strip() for kw in keywords_text.split(",") if kw.strip()]
                if not keywords:
                    return "❌ Please provide at least one keyword"

                categories[category] = keywords

                config_data = {"categories": categories}
                success, message = save_keywords_config(config_data)

                if success:
                    return f"✅ {message}"
                else:
                    return f"❌ {message}"

            except Exception as e:
                logger.error(f"Error updating category: {str(e)}")
                return f"❌ Error updating category: {str(e)}"

        def delete_category(category):
            """Delete a category.

            Returns (status message, dropdown update, keywords-box update).
            """
            try:
                from keyword_filter import load_keywords_config, save_keywords_config

                if not category:
                    return "❌ Please select a category to delete", gr.update(), gr.update()

                categories = load_keywords_config()
                if not categories:
                    return "❌ No categories found", gr.update(), gr.update()

                if category in categories:
                    del categories[category]

                    config_data = {"categories": categories}
                    success, message = save_keywords_config(config_data)

                    if success:
                        new_choices = get_category_list()
                        return f"✅ Category '{category}' deleted successfully", gr.update(choices=new_choices, value=None), gr.update()
                    else:
                        return f"❌ {message}", gr.update(), gr.update()
                else:
                    return f"❌ Category '{category}' not found", gr.update(), gr.update()

            except Exception as e:
                logger.error(f"Error deleting category: {str(e)}")
                return f"❌ Error deleting category: {str(e)}", gr.update(), gr.update()

        initial_categories = get_category_list()

        with gr.Row():
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown("### Add New Category")
                    gr.Markdown("*Create a new keyword category for article filtering*")

                    new_category_name = gr.Textbox(
                        label="Category Name",
                        placeholder="e.g., Health / Epidemics",
                        interactive=True,
                        info="Enter a descriptive name for the category"
                    )

                    new_category_keywords = gr.Textbox(
                        label="Keywords (comma-separated)",
                        placeholder="e.g., cholera, malaria, covid, outbreak",
                        lines=4,
                        interactive=True,
                        info="Enter keywords separated by commas."
                    )

                    add_category_btn = gr.Button("Add Category", variant="primary", size="lg")

            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown("### Edit Existing Category")
                    gr.Markdown("*Modify or delete existing keyword categories*")

                    category_dropdown = gr.Dropdown(
                        label="Select Category",
                        choices=initial_categories,
                        interactive=True,
                        value=initial_categories[0] if initial_categories else None,
                        info="Choose a category to edit or delete"
                    )

                    category_keywords = gr.Textbox(
                        label="Keywords (comma-separated)",
                        placeholder="Enter keywords separated by commas",
                        lines=4,
                        interactive=True,
                        info="Edit the keywords for the selected category"
                    )

                    with gr.Row():
                        update_btn = gr.Button("Update Keywords", variant="primary")
                        delete_btn = gr.Button("Delete Category", variant="stop")

        gr.Markdown("---")
        status_display = gr.Textbox(
            label="Status",
            value="Ready to manage keywords...",
            interactive=False,
            visible=True,
            info="Status messages will appear here"
        )

        # Event wiring: output component counts must match handler return arity.
        add_category_btn.click(
            fn=add_new_category,
            inputs=[new_category_name, new_category_keywords],
            outputs=[status_display, category_dropdown, category_keywords]
        )

        category_dropdown.change(
            fn=get_keywords_for_category,
            inputs=[category_dropdown],
            outputs=[category_keywords]
        )

        update_btn.click(
            fn=update_category_keywords,
            inputs=[category_dropdown, category_keywords],
            outputs=[status_display]
        )

        delete_btn.click(
            fn=delete_category,
            inputs=[category_dropdown],
            outputs=[status_display, category_dropdown, category_keywords]
        )
|
|
|
|
|
|
|
|
def create_admin_tab():
    """
    Create the admin panel tab interface.

    Builds the "Admin Panel" Gradio tab: an add-user form (shown only to
    admins), a change-password form, and a table listing all registered
    users. User operations are delegated to `auth_manager`.
    """
    with gr.Tab("Admin Panel") as admin_tab:
        gr.Markdown("## Admin Panel")
        gr.Markdown("Manage user accounts, permissions, and system settings.")

        with gr.Row():
            # Add-user section; shown/hidden by the handlers below based on
            # the current user's admin status.
            with gr.Column(scale=1, elem_classes="admin-section", visible=True) as admin_user_section:
                with gr.Group(elem_classes="admin-group"):
                    gr.Markdown("### Add New User")
                    gr.Markdown("*Create new user accounts*")

                    new_username = gr.Textbox(
                        label="Username",
                        placeholder="Enter username",
                        interactive=True
                    )

                    new_password = gr.Textbox(
                        label="Password",
                        placeholder="Enter password",
                        type="password",
                        interactive=True
                    )

                    is_admin = gr.Checkbox(
                        label="Grant admin privileges",
                        value=False,
                        interactive=True
                    )

                    add_user_btn = gr.Button("Add User", variant="primary", size="sm", elem_classes="admin-button")

            with gr.Column(scale=1, elem_classes="admin-section"):
                with gr.Group(elem_classes="admin-group"):
                    gr.Markdown("### Change Password")
                    gr.Markdown("*Update your account password*")

                    change_old_password = gr.Textbox(
                        label="Current Password",
                        placeholder="Enter current password",
                        type="password",
                        interactive=True
                    )

                    change_new_password = gr.Textbox(
                        label="New Password",
                        placeholder="Enter new password",
                        type="password",
                        interactive=True
                    )

                    change_password_btn = gr.Button("Change Password", variant="secondary", size="sm", elem_classes="admin-button")

            with gr.Column(elem_classes="admin-section"):
                with gr.Group(elem_classes="admin-group"):
                    gr.Markdown("### System Users")
                    gr.Markdown("*View all registered users*")

                    with gr.Row():
                        refresh_users_btn = gr.Button("Refresh", variant="secondary", size="sm", elem_classes="admin-button")

                    users_df = gr.Dataframe(
                        label="",
                        headers=["Username", "Admin", "Created", "Last Login"],
                        datatype=["str", "str", "str", "str"],
                        interactive=False,
                        wrap=True
                    )

        admin_status = gr.Textbox(
            label="Status",
            value="Ready - Use the controls above to manage users",
            interactive=False,
            lines=2
        )

        def _users_dataframe():
            """Build the user-list DataFrame shared by the handlers below.

            Previously this construction was duplicated in four handlers.

            Returns:
                (DataFrame of users, user count).
            """
            users = auth_manager.list_users()
            rows = [
                {
                    'Username': user,
                    'Admin': 'Yes' if info.get('is_admin', False) else 'No',
                    'Created': info.get('created_at', 'Unknown'),
                    'Last Login': info.get('last_login', 'Never'),
                }
                for user, info in users.items()
            ]
            return pd.DataFrame(rows), len(users)

        def handle_add_user(username, password, admin_check):
            """Handle adding a new user (admin only).

            Returns (status, users table, cleared username, cleared password,
            cleared checkbox, add-user-section visibility update).
            """
            if not is_authenticated() or not auth_manager.is_admin(get_current_user()):
                return "❌ Access denied - Admin privileges required", pd.DataFrame(), gr.update(value=""), gr.update(value=""), gr.update(value=False), gr.update(visible=False)

            if not username or not password:
                return "❌ Please enter both username and password", pd.DataFrame(), gr.update(value=""), gr.update(value=""), gr.update(value=False), gr.update(visible=True)

            success = auth_manager.add_user(username, password, admin_check)
            if success:
                df, _ = _users_dataframe()
                return f"✅ User '{username}' added successfully", df, gr.update(value=""), gr.update(value=""), gr.update(value=False), gr.update(visible=True)
            else:
                return f"❌ Failed to add user '{username}' (user may already exist)", pd.DataFrame(), gr.update(value=""), gr.update(value=""), gr.update(value=False), gr.update(visible=True)

        def handle_change_password(old_password, new_password):
            """Handle a password change for the logged-in user."""
            if not is_authenticated():
                return "❌ Please login first", gr.update(value=""), gr.update(value="")

            if not old_password or not new_password:
                return "❌ Please enter both current and new password", gr.update(value=""), gr.update(value="")

            success = auth_manager.change_password(get_current_user(), old_password, new_password)
            if success:
                return "✅ Password changed successfully", gr.update(value=""), gr.update(value="")
            else:
                return "❌ Failed to change password (check current password)", gr.update(value=""), gr.update(value="")

        def refresh_users_and_check_admin():
            """Refresh the user list and toggle the add-user section.

            The local is named `user_is_admin` so it does not shadow the
            `is_admin` Checkbox captured from the enclosing scope.
            """
            user_is_admin = is_authenticated() and auth_manager.is_admin(get_current_user())

            if user_is_admin:
                df, count = _users_dataframe()
                return df, f"✅ User list refreshed - {count} users found", gr.update(visible=True)
            else:
                return pd.DataFrame(), "❌ Access denied - Admin privileges required", gr.update(visible=False)

        def initialize_admin_panel():
            """Populate the panel when the tab is selected."""
            if is_authenticated() and auth_manager.is_admin(get_current_user()):
                df, count = _users_dataframe()
                return df, f"✅ Admin panel loaded - {count} users found", gr.update(visible=True)
            else:
                return pd.DataFrame(), "❌ Access denied - Admin privileges required", gr.update(visible=False)

        # NOTE: the old `refresh_users` and `check_admin_status` helpers were
        # never wired to any event and were unreachable dead code; removed.

        add_user_btn.click(
            fn=handle_add_user,
            inputs=[new_username, new_password, is_admin],
            outputs=[admin_status, users_df, new_username, new_password, is_admin, admin_user_section]
        )

        change_password_btn.click(
            fn=handle_change_password,
            inputs=[change_old_password, change_new_password],
            outputs=[admin_status, change_old_password, change_new_password]
        )

        refresh_users_btn.click(
            fn=refresh_users_and_check_admin,
            outputs=[users_df, admin_status, admin_user_section]
        )

        admin_tab.select(
            fn=initialize_admin_panel,
            outputs=[users_df, admin_status, admin_user_section]
        )
|
|
|
|
|
|
|
|
def create_website_config_tab(): |
|
|
""" |
|
|
Create the website configuration management tab interface |
|
|
""" |
|
|
with gr.Tab("Website Config"): |
|
|
|
|
|
gr.Markdown("## Website Configuration Management") |
|
|
gr.Markdown("Configure and manage CSS selectors for website scraping. Customize how content is extracted.") |
|
|
|
|
|
|
|
|
def load_website_config(): |
|
|
"""Load current website configuration""" |
|
|
try: |
|
|
from scraper_common import load_website_config |
|
|
config = load_website_config() |
|
|
return config if config else {} |
|
|
except Exception as e: |
|
|
logger.error(f"Error loading website config: {str(e)}") |
|
|
return {} |
|
|
|
|
|
def get_website_list(): |
|
|
"""Get list of website types for dropdown""" |
|
|
config = load_website_config() |
|
|
return list(config.keys()) if config else [] |
|
|
|
|
|
def get_config_for_website(website_type, current_state=None): |
|
|
"""Get configuration for a specific website""" |
|
|
|
|
|
config = current_state if current_state else load_website_config() |
|
|
if website_type and website_type in config: |
|
|
website_config = config[website_type] |
|
|
|
|
|
|
|
|
pdf_links = website_config.get('pdf_links', []) |
|
|
if isinstance(pdf_links, list): |
|
|
pdf_links_str = ", ".join(pdf_links) if pdf_links else "" |
|
|
else: |
|
|
pdf_links_str = str(pdf_links) if pdf_links else "" |
|
|
|
|
|
file_links = website_config.get('file_links', []) |
|
|
if isinstance(file_links, list): |
|
|
file_links_str = ", ".join(file_links) if file_links else "" |
|
|
else: |
|
|
file_links_str = str(file_links) if file_links else "" |
|
|
|
|
|
|
|
|
content = website_config.get('content', '') |
|
|
if isinstance(content, list): |
|
|
content_str = ", ".join(content) if content else "" |
|
|
else: |
|
|
content_str = str(content) if content else "" |
|
|
|
|
|
return ( |
|
|
website_config.get('base_url', '') or '', |
|
|
website_config.get('article_links', '') or '', |
|
|
website_config.get('page_links', '') or '', |
|
|
website_config.get('title', '') or '', |
|
|
content_str, |
|
|
website_config.get('date', '') or '', |
|
|
website_config.get('navigation_selector', '') or '', |
|
|
website_config.get('navigation_url_addition', '') or '', |
|
|
str(website_config.get('start_page', 0)) if website_config.get('start_page') is not None else '0', |
|
|
pdf_links_str, |
|
|
file_links_str, |
|
|
website_config.get('recaptcha_text', '') or '' |
|
|
) |
|
|
return ('', '', '', '', '', '', '', '', '0', '', '', '') |
|
|
|
|
|
|
|
|
initial_websites = get_website_list() |
|
|
|
|
|
|
|
|
with gr.Row(): |
|
|
|
|
|
with gr.Column(scale=1): |
|
|
with gr.Group(): |
|
|
gr.Markdown("### Select Website") |
|
|
gr.Markdown("*Choose a website to edit or delete*") |
|
|
|
|
|
website_dropdown = gr.Dropdown( |
|
|
label="Website Type", |
|
|
choices=initial_websites, |
|
|
interactive=True, |
|
|
value=initial_websites[0] if initial_websites else None, |
|
|
info="Select a website configuration to edit" |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
delete_website_btn = gr.Button("Delete Website", variant="stop") |
|
|
|
|
|
with gr.Group(): |
|
|
gr.Markdown("### Add New Website") |
|
|
gr.Markdown("*Create a new website configuration*") |
|
|
|
|
|
new_website_type = gr.Textbox( |
|
|
label="Website Type Name", |
|
|
placeholder="e.g., newsite", |
|
|
interactive=True, |
|
|
info="Enter a unique identifier (no spaces)" |
|
|
) |
|
|
|
|
|
add_website_btn = gr.Button("Add New Website", variant="primary") |
|
|
|
|
|
|
|
|
with gr.Column(scale=2): |
|
|
gr.Markdown("### Configuration Fields") |
|
|
gr.Markdown("*Edit the configuration fields below*") |
|
|
|
|
|
|
|
|
with gr.Group(): |
|
|
gr.Markdown("**Required Fields**") |
|
|
base_url_field = gr.Textbox( |
|
|
label="Base URL", |
|
|
placeholder="e.g., https://example.com", |
|
|
interactive=True, |
|
|
info="Base URL of the website (required)" |
|
|
) |
|
|
title_field = gr.Textbox( |
|
|
label="Title Selector", |
|
|
placeholder="e.g., h1, .title, #article-title", |
|
|
interactive=True, |
|
|
info="CSS selector for article title (required)" |
|
|
) |
|
|
|
|
|
content_field = gr.Textbox( |
|
|
label="Content Selector", |
|
|
placeholder="e.g., .content, p, #main-body", |
|
|
interactive=True, |
|
|
lines=2, |
|
|
info="CSS selector for article content (required). For multiple selectors, use comma-separated values." |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Group(): |
|
|
gr.Markdown("**Optional Fields**") |
|
|
|
|
|
article_links_field = gr.Textbox( |
|
|
label="Article Links Selector", |
|
|
placeholder="e.g., .article-link a, h2 a", |
|
|
interactive=True, |
|
|
info="CSS selector for article links on listing pages" |
|
|
) |
|
|
|
|
|
page_links_field = gr.Textbox( |
|
|
label="Page Links Selector", |
|
|
placeholder="e.g., .page-link a", |
|
|
interactive=True, |
|
|
info="CSS selector for page links (for document sites)" |
|
|
) |
|
|
|
|
|
date_field = gr.Textbox( |
|
|
label="Date Selector", |
|
|
placeholder="e.g., .date, time, .published", |
|
|
interactive=True, |
|
|
info="CSS selector for publication date" |
|
|
) |
|
|
|
|
|
navigation_selector_field = gr.Textbox( |
|
|
label="Navigation Selector", |
|
|
placeholder="e.g., .pagination, .nav-links", |
|
|
interactive=True, |
|
|
info="CSS selector for pagination navigation" |
|
|
) |
|
|
|
|
|
navigation_url_addition_field = gr.Textbox( |
|
|
label="Navigation URL Addition", |
|
|
placeholder="e.g., ?page={page_no}, /page/{page_no}/", |
|
|
interactive=True, |
|
|
info="URL pattern for pagination (use {page_no} as placeholder)" |
|
|
) |
|
|
|
|
|
start_page_field = gr.Textbox( |
|
|
label="Start Page", |
|
|
placeholder="0 or 1", |
|
|
interactive=True, |
|
|
value="0", |
|
|
info="Starting page number (0 or 1)" |
|
|
) |
|
|
|
|
|
pdf_links_field = gr.Textbox( |
|
|
label="PDF Links Selectors", |
|
|
placeholder="e.g., a[href$='.pdf'], .pdf-link", |
|
|
interactive=True, |
|
|
lines=2, |
|
|
info="CSS selectors for PDF links (comma-separated for multiple)" |
|
|
) |
|
|
|
|
|
file_links_field = gr.Textbox( |
|
|
label="File Links Selectors", |
|
|
placeholder="e.g., a[href$='.csv'], .file-link", |
|
|
interactive=True, |
|
|
lines=2, |
|
|
info="CSS selectors for file links (comma-separated for multiple)" |
|
|
) |
|
|
|
|
|
recaptcha_text_field = gr.Textbox( |
|
|
label="Recaptcha Text", |
|
|
placeholder="e.g., Let's confirm you are human", |
|
|
interactive=True, |
|
|
info="Text to look for when recaptcha is present" |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
update_website_btn = gr.Button("Update Website", variant="primary") |
|
|
save_all_btn = gr.Button("Save All Changes", variant="primary") |
|
|
cancel_btn = gr.Button("Cancel", variant="secondary") |
|
|
|
|
|
|
|
|
unsaved_config_state = gr.State(value={}) |
|
|
|
|
|
def add_new_website(website_type, base_url, article_links, page_links, title, content, date, |
|
|
navigation_selector, navigation_url_addition, start_page, |
|
|
pdf_links, file_links, recaptcha_text, current_state): |
|
|
"""Add a new website configuration""" |
|
|
try: |
|
|
if not website_type or not website_type.strip(): |
|
|
return gr.update(), gr.update() |
|
|
|
|
|
website_type = website_type.strip() |
|
|
|
|
|
|
|
|
if ' ' in website_type: |
|
|
return gr.update(), gr.update() |
|
|
|
|
|
|
|
|
config = current_state if current_state else load_website_config() |
|
|
if not config: |
|
|
config = {} |
|
|
|
|
|
|
|
|
if website_type in config: |
|
|
return gr.update(), gr.update() |
|
|
|
|
|
|
|
|
if not title and not content: |
|
|
return gr.update(), gr.update() |
|
|
if not base_url or not base_url.strip(): |
|
|
return gr.update(), gr.update() |
|
|
|
|
|
|
|
|
new_config = {} |
|
|
|
|
|
|
|
|
new_config['base_url'] = base_url.strip() |
|
|
|
|
|
|
|
|
if article_links.strip(): |
|
|
new_config['article_links'] = article_links.strip() |
|
|
if page_links.strip(): |
|
|
new_config['page_links'] = page_links.strip() |
|
|
if title.strip(): |
|
|
new_config['title'] = title.strip() |
|
|
if content.strip(): |
|
|
|
|
|
content_vals = [c.strip() for c in content.split(',') if c.strip()] |
|
|
if len(content_vals) > 1: |
|
|
new_config['content'] = content_vals |
|
|
else: |
|
|
new_config['content'] = content.strip() |
|
|
if date.strip(): |
|
|
new_config['date'] = date.strip() |
|
|
if navigation_selector.strip(): |
|
|
new_config['navigation_selector'] = navigation_selector.strip() |
|
|
else: |
|
|
new_config['navigation_selector'] = None |
|
|
if navigation_url_addition.strip(): |
|
|
new_config['navigation_url_addition'] = navigation_url_addition.strip() |
|
|
else: |
|
|
new_config['navigation_url_addition'] = None |
|
|
if start_page.strip(): |
|
|
try: |
|
|
new_config['start_page'] = int(start_page.strip()) |
|
|
except ValueError: |
|
|
return gr.update(), gr.update() |
|
|
else: |
|
|
new_config['start_page'] = 0 |
|
|
|
|
|
|
|
|
if pdf_links.strip(): |
|
|
pdf_list = [p.strip() for p in pdf_links.split(',') if p.strip()] |
|
|
new_config['pdf_links'] = pdf_list |
|
|
if file_links.strip(): |
|
|
file_list = [f.strip() for f in file_links.split(',') if f.strip()] |
|
|
new_config['file_links'] = file_list |
|
|
if recaptcha_text.strip(): |
|
|
new_config['recaptcha_text'] = recaptcha_text.strip() |
|
|
|
|
|
|
|
|
config[website_type] = new_config |
|
|
|
|
|
|
|
|
website_list = list(config.keys()) |
|
|
return (gr.update(choices=website_list, value=website_type), |
|
|
config) |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"Error adding website: {str(e)}") |
|
|
return gr.update(), gr.update() |
|
|
|
|
|
def update_website(website_type, base_url, article_links, page_links, title, content, date, |
|
|
navigation_selector, navigation_url_addition, start_page, |
|
|
pdf_links, file_links, recaptcha_text, current_state): |
|
|
"""Update an existing website configuration""" |
|
|
try: |
|
|
if not website_type: |
|
|
return gr.update() |
|
|
|
|
|
|
|
|
config = current_state if current_state else load_website_config() |
|
|
if not config: |
|
|
config = {} |
|
|
|
|
|
if website_type not in config: |
|
|
return gr.update() |
|
|
|
|
|
|
|
|
if not title and not content: |
|
|
return gr.update() |
|
|
if not base_url or not base_url.strip(): |
|
|
return gr.update() |
|
|
|
|
|
|
|
|
existing_config = config.get(website_type, {}) |
|
|
updated_config = existing_config.copy() |
|
|
|
|
|
|
|
|
updated_config['base_url'] = base_url.strip() |
|
|
|
|
|
|
|
|
if article_links.strip(): |
|
|
updated_config['article_links'] = article_links.strip() |
|
|
elif 'article_links' in updated_config: |
|
|
del updated_config['article_links'] |
|
|
|
|
|
if page_links.strip(): |
|
|
updated_config['page_links'] = page_links.strip() |
|
|
elif 'page_links' in updated_config: |
|
|
del updated_config['page_links'] |
|
|
|
|
|
if title.strip(): |
|
|
updated_config['title'] = title.strip() |
|
|
if content.strip(): |
|
|
|
|
|
content_vals = [c.strip() for c in content.split(',') if c.strip()] |
|
|
if len(content_vals) > 1: |
|
|
updated_config['content'] = content_vals |
|
|
else: |
|
|
updated_config['content'] = content.strip() |
|
|
|
|
|
if date.strip(): |
|
|
updated_config['date'] = date.strip() |
|
|
elif 'date' in updated_config: |
|
|
del updated_config['date'] |
|
|
|
|
|
if navigation_selector.strip(): |
|
|
updated_config['navigation_selector'] = navigation_selector.strip() |
|
|
else: |
|
|
updated_config['navigation_selector'] = None |
|
|
|
|
|
if navigation_url_addition.strip(): |
|
|
updated_config['navigation_url_addition'] = navigation_url_addition.strip() |
|
|
else: |
|
|
updated_config['navigation_url_addition'] = None |
|
|
|
|
|
if start_page.strip(): |
|
|
try: |
|
|
updated_config['start_page'] = int(start_page.strip()) |
|
|
except ValueError: |
|
|
return gr.update() |
|
|
else: |
|
|
updated_config['start_page'] = 0 |
|
|
|
|
|
|
|
|
if pdf_links.strip(): |
|
|
pdf_list = [p.strip() for p in pdf_links.split(',') if p.strip()] |
|
|
updated_config['pdf_links'] = pdf_list |
|
|
elif 'pdf_links' in updated_config: |
|
|
del updated_config['pdf_links'] |
|
|
|
|
|
if file_links.strip(): |
|
|
file_list = [f.strip() for f in file_links.split(',') if f.strip()] |
|
|
updated_config['file_links'] = file_list |
|
|
elif 'file_links' in updated_config: |
|
|
del updated_config['file_links'] |
|
|
|
|
|
if recaptcha_text.strip(): |
|
|
updated_config['recaptcha_text'] = recaptcha_text.strip() |
|
|
elif 'recaptcha_text' in updated_config: |
|
|
del updated_config['recaptcha_text'] |
|
|
|
|
|
|
|
|
config[website_type] = updated_config |
|
|
|
|
|
return config |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"Error updating website: {str(e)}") |
|
|
return gr.update() |
|
|
|
|
|
def delete_website(website_type, current_state):
    """Remove the selected website entry from the in-memory configuration.

    Operates on the unsaved state only; nothing is written to disk here.
    Returns a pair of (dropdown update, new config state). On any no-op or
    error, both elements are bare gr.update() so the UI is left untouched.
    """
    try:
        # No selection at all -> nothing to delete, leave UI as-is.
        if not website_type:
            return gr.update(), gr.update()

        # Prefer the pending in-memory edits; fall back to the saved file.
        cfg = current_state if current_state else load_website_config()
        if not cfg or website_type not in cfg:
            return gr.update(), gr.update()

        cfg.pop(website_type)

        # Refresh the dropdown; select the first remaining entry, if any.
        remaining = list(cfg)
        selected = remaining[0] if remaining else None
        return gr.update(choices=remaining, value=selected), cfg

    except Exception as e:
        logger.error(f"Error deleting website: {str(e)}")
        return gr.update(), gr.update()
|
|
|
|
|
def save_all_changes(current_state):
    """Persist the pending website configuration to disk.

    Writes via scraper_common.save_website_config, then re-reads the file
    so the dropdown reflects exactly what was stored. Returns a pair of
    (dropdown update, new unsaved-state). The unsaved state is cleared to
    {} on success and preserved on failure so the user can retry.
    """
    try:
        from scraper_common import save_website_config

        # Prefer the pending in-memory edits; fall back to the saved file.
        cfg = current_state if current_state else load_website_config()
        if not cfg:
            return gr.update(), {}

        ok, _message = save_website_config(cfg)
        if not ok:
            # Save failed: keep the unsaved edits around for another attempt.
            return gr.update(), current_state

        # Reload from disk so the choices mirror the persisted content.
        saved = load_website_config()
        return gr.update(choices=list(saved)), {}

    except Exception as e:
        logger.error(f"Error saving configuration: {str(e)}")
        return gr.update(), current_state
|
|
|
|
|
def cancel_changes():
    """Discard pending edits and repopulate the form from the saved file.

    Returns a 14-tuple: dropdown update, the 12 form-field values, and a
    fresh (empty) unsaved-state dict. When the file has no websites, the
    fields fall back to blanks (start_page defaults to '0').
    """
    try:
        cfg = load_website_config()
        names = list(cfg)

        if names:
            # Reselect the first website and mirror its config into the form.
            vals = get_config_for_website(names[0])
            return (gr.update(choices=names, value=names[0]),
                    *vals[:12],
                    {})

        # Empty config: clear every field.
        return (gr.update(choices=[]),
                '', '', '', '', '', '', '', '', '0', '', '', '',
                {})

    except Exception as e:
        logger.error(f"Error cancelling changes: {str(e)}")
        return (gr.update(), '', '', '', '', '', '', '', '', '0', '', '', '', {})
|
|
|
|
|
|
|
|
# --- Event wiring for the website-config editor tab ---

# Selecting a website repopulates every form field from the (unsaved) config.
website_dropdown.change(
    fn=get_config_for_website,
    inputs=[website_dropdown, unsaved_config_state],
    outputs=[base_url_field, article_links_field, page_links_field, title_field, content_field, date_field,
             navigation_selector_field, navigation_url_addition_field, start_page_field,
             pdf_links_field, file_links_field, recaptcha_text_field]
)

# Create a brand-new website entry from the current form values.
add_website_btn.click(
    fn=add_new_website,
    inputs=[new_website_type, base_url_field, article_links_field, page_links_field, title_field, content_field, date_field,
            navigation_selector_field, navigation_url_addition_field, start_page_field,
            pdf_links_field, file_links_field, recaptcha_text_field, unsaved_config_state],
    outputs=[website_dropdown, unsaved_config_state]
)

# Overwrite the selected website's entry with the current form values
# (in-memory only; "Save all" persists).
update_website_btn.click(
    fn=update_website,
    inputs=[website_dropdown, base_url_field, article_links_field, page_links_field, title_field, content_field, date_field,
            navigation_selector_field, navigation_url_addition_field, start_page_field,
            pdf_links_field, file_links_field, recaptcha_text_field, unsaved_config_state],
    outputs=[unsaved_config_state]
)

# Remove the selected website from the unsaved config.
delete_website_btn.click(
    fn=delete_website,
    inputs=[website_dropdown, unsaved_config_state],
    outputs=[website_dropdown, unsaved_config_state]
)

# Persist all pending edits to the config file and clear the unsaved state.
save_all_btn.click(
    fn=save_all_changes,
    inputs=[unsaved_config_state],
    outputs=[website_dropdown, unsaved_config_state]
)

# Discard pending edits and reload the whole form from disk.
cancel_btn.click(
    fn=cancel_changes,
    outputs=[website_dropdown, base_url_field, article_links_field, page_links_field, title_field,
             content_field, date_field, navigation_selector_field, navigation_url_addition_field,
             start_page_field, pdf_links_field, file_links_field, recaptcha_text_field, unsaved_config_state]
)
|
|
|
|
|
|
|
|
def create_main_app():
    """
    Create the main application with authentication flow.

    Builds a single gr.Blocks app containing:
      * a login section (visible by default) that collects credentials, and
      * a dashboard section (hidden until login succeeds) hosting the
        feature tabs and a logout button.

    Login/logout toggle the two sections' visibility directly via
    gr.update; no page reload is involved.

    Returns:
        gr.Blocks: the assembled application, ready for `.launch()`.
    """
    with gr.Blocks(
        title="Raagsan Dashboard Web Scrapping",
        theme=gr.themes.Soft(),
        # Custom CSS layered on top of the Soft theme: fixed-width layout,
        # tab/button/dataframe theming, login card, gradient dashboard
        # header, scrollbars, and assorted hover/animation polish.
        css="""
        /* Global Container Styles */
        .gradio-container {
            max-width: 1400px !important;
            margin: 0 auto !important;
            width: 100% !important;
            padding: 20px !important;
            min-height: 100vh !important;
        }

        /* Ensure all tabs use full width */
        .tabs > .tab-nav, .tabs > .tabitem {
            max-width: 1400px !important;
            width: 100% !important;
        }

        /* Tab Navigation Styling */
        .tab-nav button {
            border: 2px solid var(--border-color-primary) !important;
            border-radius: 10px 10px 0 0 !important;
            margin-right: 5px !important;
            padding: 12px 24px !important;
            font-weight: 600 !important;
            transition: all 0.3s ease !important;
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
        }

        .tab-nav button[aria-selected="true"] {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
            color: white !important;
            border-color: #667eea !important;
            box-shadow: 0 4px 8px rgba(102, 126, 234, 0.3) !important;
        }

        .tab-nav button:hover {
            transform: translateY(-2px) !important;
            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15) !important;
        }

        /* Tab Content Container */
        .tabitem {
            border: 2px solid var(--border-color-primary) !important;
            border-radius: 0 10px 10px 10px !important;
            padding: 30px !important;
            box-shadow: 0 10px 30px rgba(0, 0, 0, 0.15) !important;
            margin-top: 0 !important;
        }

        /* Ensure rows and columns in all tabs expand to full width */
        .gradio-row {
            width: 100% !important;
            gap: 20px !important;
            margin-bottom: 15px !important;
        }

        .gradio-column {
            width: 100% !important;
        }

        /* Card Style for Sections */
        .gradio-group {
            border: 2px solid var(--border-color-primary) !important;
            border-radius: 12px !important;
            padding: 25px !important;
            margin: 15px 0 !important;
            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1) !important;
            transition: all 0.3s ease !important;
        }

        .gradio-group:hover {
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2) !important;
            border-color: #667eea !important;
        }

        /* Input Fields Styling */
        .gradio-textbox input, .gradio-textbox textarea {
            border: 2px solid var(--border-color-primary) !important;
            border-radius: 8px !important;
            padding: 12px !important;
            font-size: 14px !important;
            transition: all 0.3s ease !important;
        }

        .gradio-textbox input:focus, .gradio-textbox textarea:focus {
            border-color: #667eea !important;
            box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.2) !important;
            outline: none !important;
        }

        /* Dropdown Styling */
        .gradio-dropdown {
            border-radius: 8px !important;
        }

        .gradio-dropdown > div {
            border: 2px solid var(--border-color-primary) !important;
            border-radius: 8px !important;
            transition: all 0.3s ease !important;
        }

        .gradio-dropdown > div:focus-within {
            border-color: #667eea !important;
            box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.2) !important;
        }

        /* Button Styling */
        button {
            border-radius: 8px !important;
            padding: 10px 24px !important;
            font-weight: 600 !important;
            transition: all 0.3s ease !important;
            border: none !important;
        }

        button:hover {
            transform: translateY(-2px) !important;
            box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15) !important;
        }

        button:active {
            transform: translateY(0) !important;
        }

        /* Primary Button */
        button[variant="primary"] {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
            color: white !important;
            box-shadow: 0 4px 8px rgba(102, 126, 234, 0.3) !important;
        }

        /* Secondary Button */
        button[variant="secondary"] {
            background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%) !important;
            color: white !important;
            box-shadow: 0 4px 8px rgba(245, 87, 108, 0.3) !important;
        }

        /* Stop/Danger Button */
        button[variant="stop"] {
            background: linear-gradient(135deg, #fa709a 0%, #fee140 100%) !important;
            color: #333 !important;
            box-shadow: 0 4px 8px rgba(250, 112, 154, 0.3) !important;
        }

        /* Dataframe Styling */
        .gradio-dataframe {
            border: 2px solid var(--border-color-primary) !important;
            border-radius: 12px !important;
            overflow: hidden !important;
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15) !important;
        }

        .gradio-dataframe table {
            border-collapse: separate !important;
            border-spacing: 0 !important;
        }

        .gradio-dataframe th {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
            color: white !important;
            padding: 15px !important;
            font-weight: 600 !important;
            text-transform: uppercase !important;
            font-size: 12px !important;
            letter-spacing: 0.5px !important;
            border: 1px solid #667eea !important;
        }

        .gradio-dataframe td {
            padding: 12px 15px !important;
            border: 1px solid var(--border-color-primary) !important;
        }

        .gradio-dataframe tr:hover {
            background-color: rgba(102, 126, 234, 0.1) !important;
        }

        /* Markdown Headings */
        h2 {
            font-weight: 700 !important;
            margin-bottom: 10px !important;
            font-size: 24px !important;
        }

        h3 {
            font-weight: 600 !important;
            margin-bottom: 8px !important;
            font-size: 18px !important;
        }

        /* Login Container */
        .login-container {
            max-width: 500px !important;
            margin: 50px auto !important;
            padding: 40px !important;
            border-radius: 20px !important;
            border: 2px solid var(--border-color-primary) !important;
            box-shadow: 0 20px 60px rgba(0, 0, 0, 0.2) !important;
        }

        /* Dashboard Header */
        .dashboard-header {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
            color: white !important;
            padding: 25px !important;
            border-radius: 15px !important;
            margin-bottom: 25px !important;
            box-shadow: 0 10px 30px rgba(102, 126, 234, 0.3) !important;
        }

        .dashboard-header * {
            color: white !important;
        }

        .dashboard-header h1,
        .dashboard-header h2,
        .dashboard-header h3,
        .dashboard-header p,
        .dashboard-header span,
        .dashboard-header div {
            color: white !important;
        }

        .header-row {
            display: flex !important;
            align-items: center !important;
            justify-content: space-between !important;
            gap: 20px !important;
        }

        .header-left {
            flex: 1 !important;
        }

        .header-left * {
            color: white !important;
        }

        .header-right {
            display: flex !important;
            flex-direction: column !important;
            align-items: flex-end !important;
            gap: 10px !important;
        }

        .header-right * {
            color: white !important;
        }

        .user-welcome {
            margin: 0 !important;
            font-size: 16px !important;
            font-weight: 500 !important;
            color: white !important;
        }

        .user-welcome * {
            color: white !important;
        }

        .logout-btn {
            min-width: 100px !important;
            background: rgba(255, 255, 255, 0.2) !important;
            backdrop-filter: blur(10px) !important;
            border: 2px solid white !important;
            color: white !important;
        }

        .logout-btn:hover {
            background: white !important;
            color: #667eea !important;
        }

        /* Status Messages */
        .status-success {
            color: #28a745 !important;
            font-weight: bold !important;
        }

        .status-error {
            color: #dc3545 !important;
            font-weight: bold !important;
        }

        /* Admin Panel Specific */
        .admin-panel {
            border: 2px solid var(--border-color-primary) !important;
            padding: 20px !important;
            border-radius: 15px !important;
            margin-top: 20px !important;
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15) !important;
        }

        .admin-group {
            border: 2px solid var(--border-color-primary) !important;
            border-radius: 12px !important;
            padding: 25px !important;
            margin: 15px 0 !important;
            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1) !important;
        }

        .admin-section {
            margin-bottom: 30px !important;
            border: 2px solid var(--border-color-primary) !important;
            border-radius: 15px !important;
            padding: 20px !important;
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1) !important;
        }

        .admin-button {
            margin: 5px !important;
        }

        /* Label Styling */
        label {
            font-weight: 600 !important;
            font-size: 14px !important;
            margin-bottom: 8px !important;
        }

        /* Info Text */
        .gradio-info {
            font-size: 12px !important;
            font-style: italic !important;
            opacity: 0.8 !important;
        }

        /* Download Button */
        .download-button {
            background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%) !important;
            color: white !important;
            box-shadow: 0 4px 8px rgba(79, 172, 254, 0.3) !important;
        }

        /* Scrollbar Styling */
        ::-webkit-scrollbar {
            width: 10px !important;
            height: 10px !important;
        }

        ::-webkit-scrollbar-track {
            background: var(--background-fill-secondary) !important;
            border-radius: 10px !important;
        }

        ::-webkit-scrollbar-thumb {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
            border-radius: 10px !important;
        }

        ::-webkit-scrollbar-thumb:hover {
            background: linear-gradient(135deg, #764ba2 0%, #667eea 100%) !important;
        }

        /* Status Textbox Styling */
        .gradio-textbox[label="Status"] {
            border: 2px solid var(--border-color-primary) !important;
            border-radius: 10px !important;
            padding: 15px !important;
        }

        /* Checkbox Styling */
        input[type="checkbox"] {
            width: 20px !important;
            height: 20px !important;
            accent-color: #667eea !important;
        }

        /* Markdown Paragraphs */
        p {
            line-height: 1.6 !important;
        }

        /* Section Dividers */
        hr {
            border: none !important;
            height: 2px !important;
            background: var(--border-color-primary) !important;
            margin: 30px 0 !important;
            opacity: 0.3 !important;
        }

        /* Better spacing for form elements */
        .gradio-form {
            gap: 15px !important;
        }

        /* Hover effects for cards */
        .admin-section:hover {
            transform: translateY(-2px) !important;
            transition: all 0.3s ease !important;
        }

        /* Loading Animation Enhancement */
        @keyframes pulse {
            0%, 100% { opacity: 1; }
            50% { opacity: 0.5; }
        }

        .loading {
            animation: pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite !important;
        }

        /* Improve link styling in markdown */
        a {
            color: #667eea !important;
            text-decoration: none !important;
            font-weight: 600 !important;
            transition: all 0.3s ease !important;
        }

        a:hover {
            color: #764ba2 !important;
            text-decoration: underline !important;
        }

        /* Better spacing for rows within groups */
        .gradio-group .gradio-row {
            margin-bottom: 10px !important;
        }
        """
    ) as main_app:

        # Per-browser-session auth flag.
        # NOTE(review): auth_state is never read or written by the handlers
        # below — visibility is toggled directly; confirm whether it is
        # still needed or dead weight.
        auth_state = gr.State({"authenticated": False, "user": None})

        with gr.Column() as main_content:

            # ----- Login section: shown until credentials are accepted -----
            with gr.Row(visible=True) as login_section:
                with gr.Column(elem_classes="login-container"):
                    gr.Markdown("# Dashboard Login")
                    gr.Markdown("Please login to access Dashboard")

                    with gr.Row():
                        username_input = gr.Textbox(
                            label="Username",
                            placeholder="Enter your username",
                            interactive=True,
                            scale=2
                        )

                    with gr.Row():
                        password_input = gr.Textbox(
                            label="Password",
                            placeholder="Enter your password",
                            type="password",  # masked input
                            interactive=True,
                            scale=2
                        )

                    with gr.Row():
                        login_btn = gr.Button("Login", variant="primary", scale=1)

                    # Read-only feedback line for login/logout results.
                    login_status = gr.Textbox(
                        label="Status",
                        value="Ready to login - Enter your credentials above",
                        interactive=False,
                        elem_classes="status-success"
                    )

            # ----- Dashboard section: hidden until login succeeds -----
            with gr.Column(visible=False) as dashboard_section:

                # Gradient header with title on the left, user info + logout
                # on the right.
                with gr.Column(elem_classes="dashboard-header"):
                    with gr.Row(elem_classes="header-row"):

                        with gr.Column(scale=3, elem_classes="header-left"):
                            gr.Markdown("# Raagsan Dashboard")
                            gr.Markdown("Extract and analyze content from websites and documents (PDF, DOC, CSV).")

                        with gr.Column(scale=1, elem_classes="header-right"):
                            user_info = gr.Markdown("Welcome, Guest", elem_classes="user-welcome")
                            logout_btn = gr.Button("Logout", variant="stop", size="sm", elem_classes="logout-btn")

                # Feature tabs; each builder defines its own components and
                # event wiring.
                with gr.Tabs():
                    create_text_content_tab()
                    create_document_content_tab()
                    create_archive_tab()
                    create_keywords_management_tab()
                    create_admin_tab()
                    create_website_config_tab()

        def handle_login(username, password):
            """Handle login attempt.

            Returns (status message, login-section update,
            dashboard-section update, user-info update) — on success the
            login section is hidden and the dashboard revealed.
            """
            if not username or not password:
                return "Please enter both username and password", gr.update(visible=True), gr.update(visible=False), gr.update(visible=False, value="Welcome, Guest")

            success, message = login_user(username, password)
            if success:
                return f"{message}", gr.update(visible=False), gr.update(visible=True), gr.update(visible=True, value=f"Welcome, {username}")
            else:
                return f"{message}", gr.update(visible=True), gr.update(visible=False), gr.update(visible=False, value="Welcome, Guest")

        def handle_logout():
            """Handle logout: restore the login view and clear both
            credential fields (the last two updates)."""
            message = logout_user()
            return f"{message}", gr.update(visible=True), gr.update(visible=False), gr.update(visible=False, value="Welcome, Guest"), gr.update(value=""), gr.update(value="")

        # Wire the auth buttons to the visibility-toggling handlers above.
        login_btn.click(
            fn=handle_login,
            inputs=[username_input, password_input],
            outputs=[login_status, login_section, dashboard_section, user_info]
        )

        logout_btn.click(
            fn=handle_logout,
            outputs=[login_status, login_section, dashboard_section, user_info, username_input, password_input]
        )

    return main_app
|
|
|
|
|
|
|
|
|
|
|
# Build the app at import time so `demo` exists as a module-level object —
# presumably for hosts (e.g. HF Spaces) that import the module and look for
# `demo` rather than running it as a script; verify against the deployment.
demo = create_main_app()

if __name__ == "__main__":

    # Drop stale auth sessions before serving so old logins can't be reused.
    auth_manager.cleanup_expired_sessions()

    # Bind on all interfaces on the conventional Gradio port.
    # NOTE(review): debug=True in a deployed service — confirm intentional.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
    )
|
|
|