""" Text Scraper - Handles article and text content processing """ import asyncio import logging import re from datetime import datetime from typing import List, Dict, Any import time # Import common functions from scraper_common from scraper_common import ( WEBSITE_CONFIG, MAX_ARTICLE_LIMIT, MAX_PAGE_LIMIT, convert_to_absolute_url, scraping_cancelled ) # Import keyword filtering utilities from keyword_filter import get_category_for_text # Import date filtering utilities from date_filter import is_date_in_range, standardize_date # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s' ) logger = logging.getLogger(__name__) def construct_navigation_url(base_url: str, nav_addition: str) -> str: """ Construct navigation URL by properly handling trailing slashes and query parameters """ # Remove trailing slash from base URL if it exists if base_url.endswith('/'): base_url = base_url.rstrip('/') # Check if nav_addition starts with / or ? if nav_addition.startswith('/'): # Direct path addition return base_url + nav_addition elif nav_addition.startswith('?'): # Query parameter addition return base_url + nav_addition else: # Default: add as path return base_url + '/' + nav_addition # Global variables for text processing mopnd_article_dates = {} async def get_article_links_with_dates_from_page(page, config: dict, website_type: str) -> List[str]: """ Get article links with dates from a single page (for MOPND) """ try: logger.info(f"🔍 Extracting article links with dates from page for {website_type}") # Get article link selector (check both article_links and page_links for PDF sites) article_selector = config.get("article_links") or config.get("page_links") if not article_selector: logger.warning("⚠️ No article_links or page_links selector found in config") return [] # Get date selector date_selector = config.get("date") if not date_selector: logger.warning("⚠️ No date selector found in config") return [] # Get all article link elements link_elements = await page.query_selector_all(article_selector) logger.info(f"📰 Found {len(link_elements)} article link elements") # Get all date elements date_elements = await page.query_selector_all(date_selector) logger.info(f"📅 Found {len(date_elements)} date elements") # Extract links and dates article_links = [] for i, link_element in enumerate(link_elements): try: # Get the href attribute href = await link_element.get_attribute("href") if href: # Convert to absolute URL absolute_url = convert_to_absolute_url(href, page.url) article_links.append(absolute_url) # Try to get corresponding date (assuming same order) if i < len(date_elements): try: date_text = await date_elements[i].text_content() if date_text and date_text.strip(): # Store the date for this article URL mopnd_article_dates[absolute_url] = date_text.strip() logger.debug(f"✅ Stored date for {absolute_url}: {date_text.strip()}") except Exception as e: logger.debug(f"⚠️ Could not extract date for link {i}: {str(e)}") except Exception as e: logger.warning(f"❌ Error extracting link {i}: {str(e)}") continue logger.info(f"🔗 Extracted {len(article_links)} article links with dates") return article_links except Exception as e: logger.error(f"❌ Error extracting article links with dates: {str(e)}") return [] async def get_all_article_links_unified(page, url: str, config: dict, website_type: str = None) -> List[str]: """ Function to get article links from multiple pages with pagination support Stops when no new 
async def get_all_article_links_unified(page, url: str, config: dict, website_type: str = None) -> List[str]:
    """
    Get article links from multiple pages, with pagination support.
    Stops when no new (non-repeating) articles are found.
    """
    try:
        logger.info(f"🔍 Getting article links from: {url}")
        logger.info(f"🌐 Website type: {website_type}")

        # Check if navigation is configured
        navigation_selector = config.get("navigation_selector")
        navigation_url_addition = config.get("navigation_url_addition")
        start_page = config.get("start_page", 1)

        all_article_links = []
        seen_links = set()  # Track unique links to detect duplicates
        current_page = start_page
        consecutive_empty_pages = 0
        max_consecutive_empty = 2  # Stop after 2 consecutive pages with no new content

        # Navigate to the initial page
        await page.goto(url, wait_until="domcontentloaded", timeout=30000)

        # Handle pagination if configured
        if navigation_selector and navigation_url_addition:
            logger.info(f"🧭 Navigation configured: selector={navigation_selector}, url_addition={navigation_url_addition}")
            logger.info(f"📄 Starting from page: {start_page}")

            while True:
                logger.info(f"📄 Processing page {current_page}")

                # Check MAX_PAGE_LIMIT if set
                if MAX_PAGE_LIMIT is not None and current_page > MAX_PAGE_LIMIT:
                    logger.info(f"🛑 Reached MAX_PAGE_LIMIT ({MAX_PAGE_LIMIT}), stopping pagination")
                    break

                # Navigate to the current page if it is not the first page
                if current_page > start_page:
                    nav_url_addition = navigation_url_addition.replace("{page_no}", str(current_page))
                    nav_url = construct_navigation_url(url, nav_url_addition)
                    logger.info(f"🧭 Navigating to: {nav_url}")
                    await page.goto(nav_url, wait_until="domcontentloaded", timeout=30000)

                # Check if a navigation element exists for the next page
                nav_element = await page.query_selector(navigation_selector)
                if current_page == start_page and nav_element:
                    logger.info("✅ Navigation element found, more pages available")
                elif current_page > start_page and not nav_element:
                    logger.info("📄 No more navigation elements found, stopping pagination")
                    break

                # Extract links from the current page
                page_links = await extract_links_from_current_page(page, config, website_type)

                if page_links:
                    # Check for new (non-duplicate) links
                    new_links = []
                    for link in page_links:
                        if link not in seen_links:
                            seen_links.add(link)
                            new_links.append(link)

                    if new_links:
                        all_article_links.extend(new_links)
                        consecutive_empty_pages = 0  # Reset counter
                        logger.info(f"📰 Found {len(new_links)} new links on page {current_page} (total: {len(page_links)} links on page)")
                    else:
                        consecutive_empty_pages += 1
                        logger.info(f"📰 No new links found on page {current_page} (all {len(page_links)} links were duplicates)")

                        # Stop if we've had too many consecutive pages with no new content
                        if consecutive_empty_pages >= max_consecutive_empty:
                            logger.info(f"🛑 Stopping pagination: {consecutive_empty_pages} consecutive pages with no new content")
                            break
                else:
                    consecutive_empty_pages += 1
                    logger.info(f"📰 No links found on page {current_page}")

                    # Stop if we've had too many consecutive pages with no content
                    if consecutive_empty_pages >= max_consecutive_empty:
                        logger.info(f"🛑 Stopping pagination: {consecutive_empty_pages} consecutive pages with no content")
                        break

                current_page += 1
        else:
            # No pagination configured, scrape a single page only
            logger.info("📄 No navigation configured - scraping single page only")
            page_links = await extract_links_from_current_page(page, config, website_type)
            all_article_links.extend(page_links)

        logger.info(f"📊 Total unique article links found across all pages: {len(all_article_links)}")
        return all_article_links

    except Exception as e:
        logger.error(f"❌ Error getting article links: {str(e)}")
        return []
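
# Illustrative only (hypothetical URLs): with navigation_url_addition="?page={page_no}",
# the pagination loop above builds page URLs via construct_navigation_url, e.g.:
#   construct_navigation_url("https://example.com/news/", "?page=2")   -> "https://example.com/news?page=2"
#   construct_navigation_url("https://example.com/news", "page/3")     -> "https://example.com/news/page/3"
#   construct_navigation_url("https://example.com/news", "/archive/4") -> "https://example.com/news/archive/4"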
async def extract_links_from_current_page(page, config: dict, website_type: str) -> List[str]:
    """
    Extract article links from the current page.
    """
    try:
        # For MOPND, use the special function that also captures dates
        if website_type == "mopnd":
            return await get_article_links_with_dates_from_page(page, config, website_type)

        # Regular article link extraction (check both article_links and page_links for PDF sites)
        article_selector = config.get("article_links") or config.get("page_links")
        if not article_selector:
            logger.warning("⚠️ No article_links or page_links selector found in config")
            return []

        # Handle different selector types
        if isinstance(article_selector, list):
            # If it's a list, use the first selector
            article_selector = article_selector[0]
            logger.info(f"📝 Using first selector from list: {article_selector}")
        elif not isinstance(article_selector, str):
            logger.error(f"❌ Invalid selector type: {type(article_selector)}. Expected string or list.")
            return []

        # Get all article link elements
        link_elements = await page.query_selector_all(article_selector)
        logger.info(f"📰 Found {len(link_elements)} article link elements on current page")

        # Extract links
        page_links = []
        for i, link_element in enumerate(link_elements):
            try:
                # First try to get the href directly from the element
                href = await link_element.get_attribute("href")

                # If no href was found, try a descendant <a> element
                if not href:
                    child_link = await link_element.query_selector("a")
                    if child_link:
                        href = await child_link.get_attribute("href")

                # If still no href, walk up the ancestors looking for an <a> with an href
                if not href:
                    try:
                        ancestor_href = await link_element.evaluate("""
                            (element) => {
                                let current = element;
                                for (let i = 0; i < 5; i++) {
                                    if (current.tagName === 'A' && current.href) {
                                        return current.href;
                                    }
                                    current = current.parentElement;
                                    if (!current) break;
                                }
                                return null;
                            }
                        """)
                        if ancestor_href:
                            href = ancestor_href
                    except Exception as e:
                        logger.debug(f"Could not find ancestor link: {e}")

                if href:
                    absolute_url = convert_to_absolute_url(href, page.url)
                    page_links.append(absolute_url)
                else:
                    logger.warning(f"⚠️ No href found for element {i}")

            except Exception as e:
                logger.warning(f"❌ Error extracting link {i}: {str(e)}")
                continue

        return page_links

    except Exception as e:
        logger.error(f"❌ Error extracting links from current page: {str(e)}")
        return []
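
# For reference: a successfully extracted article (see extract_article_content_unified
# below) is a dict of this shape, shown here with illustrative values only:
#   {
#       "title": "Example headline",
#       "content": "Example body text...",
#       "date": "2024-01-31",                # standardized to YYYY-MM-DD
#       "url": "https://example.com/news/example-article",
#       "category": "example-category",      # assigned by get_category_for_text
#   }
# Articles filtered out by the date-range or keyword checks come back as None
# and are skipped by extract_all_articles_unified.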
async def extract_all_articles_unified(page, article_links: List[str], config: dict, website_type: str = None,
                                       custom_keywords: str = "", start_date: str = None, end_date: str = None) -> List[dict]:
    """
    Unified function to extract content from all articles.
    Limited by MAX_ARTICLE_LIMIT if set.
    """
    logger.info(f"📚 Starting article extraction for {len(article_links)} articles")
    logger.debug(f"🔧 Website type: {website_type}, Article limit: {MAX_ARTICLE_LIMIT}")

    all_articles = []

    # Apply article limit if set
    if MAX_ARTICLE_LIMIT is not None and len(article_links) > MAX_ARTICLE_LIMIT:
        logger.info(f"📊 Limiting to first {MAX_ARTICLE_LIMIT} articles out of {len(article_links)} total")
        article_links = article_links[:MAX_ARTICLE_LIMIT]

    logger.info(f"🎯 Processing {len(article_links)} articles")

    for i, link in enumerate(article_links):
        if scraping_cancelled():
            logger.info("🛑 Scraping cancelled, stopping article extraction")
            break

        logger.info(f"📰 Processing article {i+1}/{len(article_links)}: {link}")

        try:
            # Wrap extraction in a timeout to prevent hanging, with a retry mechanism
            try:
                # First attempt: 1-minute timeout per article
                article_data = await asyncio.wait_for(
                    extract_article_content_unified(page, link, config, website_type, custom_keywords, start_date, end_date),
                    timeout=60
                )
                if article_data is not None:
                    # Only append if content was extracted and matched keywords/date
                    all_articles.append(article_data)
                else:
                    logger.info(f"📄 Skipped article {i+1} (no content, no keyword match, or date out of range): {link}")
            except asyncio.TimeoutError:
                logger.warning(f"First attempt timed out for article {i+1}, retrying with a shorter timeout...")
                # Retry with an even shorter timeout
                try:
                    article_data = await asyncio.wait_for(
                        extract_article_content_unified(page, link, config, website_type, custom_keywords, start_date, end_date),
                        timeout=30  # 30-second timeout for the retry
                    )
                    if article_data is not None:
                        # Only append if content was extracted and matched keywords/date
                        all_articles.append(article_data)
                    else:
                        logger.info(f"📄 Skipped article {i+1} (no content, no keyword match, or date out of range): {link}")
                except asyncio.TimeoutError:
                    logger.error(f"Timeout extracting article {i+1} after retry: {link}")
                    all_articles.append({
                        "title": f"Timeout extracting article {i+1}",
                        "content": f"Article extraction timed out after multiple attempts: {link}",
                        "date": datetime.now().strftime("%Y-%m-%d"),
                        "url": link
                    })
            except Exception as e:
                logger.error(f"Error extracting article {i+1}: {str(e)}")
                all_articles.append({
                    "title": f"Error extracting article {i+1}",
                    "content": f"Error extracting article: {str(e)}",
                    "date": datetime.now().strftime("%Y-%m-%d"),
                    "url": link
                })
        except Exception as e:
            logger.error(f"Unexpected error processing article {i+1}: {str(e)}")
            all_articles.append({
                "title": f"Error processing article {i+1}",
                "content": f"Unexpected error: {str(e)}",
                "date": datetime.now().strftime("%Y-%m-%d"),
                "url": link
            })

    return all_articles
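
# Timeout ladder, for reference: extract_all_articles_unified wraps each article in a
# 60-second asyncio.wait_for with one 30-second retry on timeout; within that window,
# extract_article_content_unified (below) makes up to 5 page.goto attempts with
# progressively shorter navigation timeouts (30s, 20s, 15s, then 10s).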
async def extract_article_content_unified(page, article_url: str, config: dict, website_type: str = None,
                                          custom_keywords: str = "", start_date: str = None, end_date: str = None) -> dict:
    """
    Unified function to extract content from a single article (text-focused).
    Makes up to 5 attempts to load the article.
    """
    try:
        max_retries = 5
        retry_count = 0

        while retry_count < max_retries:
            try:
                retry_count += 1
                logger.info(f"🔄 Loading article (attempt {retry_count}/{max_retries}): {article_url}")

                # Navigate to the article with a different strategy per attempt
                if retry_count == 1:
                    # First attempt: use domcontentloaded for faster loading
                    await page.goto(article_url, wait_until="domcontentloaded", timeout=30000)
                elif retry_count == 2:
                    # Second attempt: use basic loading with a shorter timeout
                    await page.goto(article_url, timeout=20000)
                elif retry_count == 3:
                    # Third attempt: use networkidle with an even shorter timeout
                    await page.goto(article_url, wait_until="networkidle", timeout=15000)
                else:
                    # Fourth and fifth attempts: try with shorter timeouts
                    await page.goto(article_url, timeout=10000)

                logger.info(f"✅ Successfully loaded article on attempt {retry_count}")
                break  # Success, exit retry loop

            except Exception as e:
                logger.warning(f"⚠️ Attempt {retry_count} failed for {article_url}: {str(e)}")
                if retry_count >= max_retries:
                    logger.error(f"❌ Failed to load article after {max_retries} attempts: {article_url}")
                    return {
                        "title": "Network Error",
                        "content": f"Failed to access article after {max_retries} attempts: {str(e)}",
                        "date": datetime.now().strftime("%Y-%m-%d"),
                        "url": article_url
                    }
                # Wait 2 seconds before retrying
                await asyncio.sleep(2)

        # Extract title
        title = ""
        try:
            title_element = await page.query_selector(config.get("title"))
            if title_element:
                title = await title_element.text_content()
                if title:
                    title = title.strip()
        except Exception as e:
            logger.warning(f"Error extracting title: {str(e)}")
            title = ""

        # Use the passed website_type, or try to determine it from the config
        if website_type is None:
            for site_type, site_config in WEBSITE_CONFIG.items():
                if site_config == config:
                    website_type = site_type
                    break
            if website_type is None:
                website_type = "unknown"

        content = ""

        # Extract content based on website type
        if website_type == "hiiraan":
            # Special handling for hiiraan.com
            content_selector = config.get("content")
            try:
                # Get the content directly from the span
                content_element = await page.query_selector(content_selector)
                if content_element:
                    # Get inner HTML and clean it up
                    html_content = await content_element.inner_html()
                    # Remove script tags and their contents
                    html_content = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL)
                    # Remove ad blocks (the original markup pattern was lost; this matches
                    # generic div containers whose class contains "ad")
                    html_content = re.sub(r'<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>', '', html_content, flags=re.DOTALL)
                    # Extract text from HTML
                    content = re.sub(r'<.*?>', ' ', html_content)
                    content = re.sub(r'\s+', ' ', content).strip()
            except Exception as e:
                logger.warning(f"Error extracting hiiraan content: {str(e)}")
                content = ""
        else:
            # Regular content extraction
            content_selector = config.get("content")
            content = ""
            try:
                content_elements = await page.query_selector_all(content_selector)
                content_parts = []
                for element in content_elements:
                    text = await element.text_content()
                    if text:
                        content_parts.append(text.strip())
                content = "\n\n".join(content_parts)
            except Exception as e:
                logger.warning(f"Error extracting content: {str(e)}")
                content = ""

        # Extract date using the configured selector
        date_raw = ""
        # For MOPND, use the date extracted from the main page
        if website_type == "mopnd" and article_url in mopnd_article_dates:
            date_raw = mopnd_article_dates[article_url]
            logger.debug(f"✅ Using MOPND date from main page: {date_raw}")
        else:
            # Regular date extraction for other websites
            date_selector = config.get("date")
            if date_selector:
                try:
                    date_element = await page.query_selector(date_selector)
                    if date_element:
                        date_raw = await date_element.text_content()
                        if date_raw:
                            date_raw = date_raw.strip()
                            logger.debug(f"✅ Extracted raw date: {date_raw}")
                except Exception as e:
                    logger.warning(f"Error extracting date with selector {date_selector}: {str(e)}")

        # Standardize the date to YYYY-MM-DD format
        date = standardize_date(date_raw, default_to_current=True)
        if not date:
            date = datetime.now().strftime("%Y-%m-%d")
            logger.info(f"No date found with config selector, using current date: {date}")

        # Check date range filtering
        from date_filter import parse_date_input
        start_dt = parse_date_input(start_date) if start_date else None
        end_dt = parse_date_input(end_date) if end_date else None
        if start_dt is not None or end_dt is not None:
            if not is_date_in_range(date, start_dt, end_dt, include_missing=False):
                logger.info(f"📅 Article date {date} is outside date range [{start_date}, {end_date}] - filtering out")
                return None

        # Check for keyword matching and category assignment
        combined_text = f"{title} {content}".strip()
        category = get_category_for_text(combined_text, custom_keywords)
        if category is None:
            logger.info("📂 Article did not match any keyword categories - filtering out")
            return None
        elif category:
            logger.info(f"📂 Article categorized as: {category}")
        else:
            logger.info("📂 Article kept with empty category")

        result = {
            "title": title or "No title found",
            "content": content or "No content found",
            "date": date,
            "url": article_url,
            "category": category
        }
        logger.info(f"📊 Article result: title='{result['title'][:50]}...', category='{category}'")
        return result

    except Exception as e:
        logger.error(f"Error extracting content from {article_url}: {str(e)}")
        return {
            "title": "Error",
            "content": f"Error extracting content: {str(e)}",
            "date": datetime.now().strftime("%Y-%m-%d"),
            "url": article_url
        }
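
# Minimal usage sketch, not part of the scraping pipeline itself. It assumes the
# page object used throughout this module comes from Playwright's async API (which
# the page.goto / query_selector_all calls suggest) and that WEBSITE_CONFIG keys
# name the supported sites. The URL and site key passed in __main__ are placeholders.
async def _demo_run(url: str, website_type: str) -> List[dict]:
    from playwright.async_api import async_playwright  # assumed browser driver

    config = WEBSITE_CONFIG.get(website_type, {})
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        try:
            links = await get_all_article_links_unified(page, url, config, website_type)
            articles = await extract_all_articles_unified(
                page, links, config, website_type,
                custom_keywords="", start_date=None, end_date=None
            )
        finally:
            await browser.close()
    return articles


if __name__ == "__main__":
    # Replace the placeholders with a real site key from WEBSITE_CONFIG and its URL.
    demo_articles = asyncio.run(_demo_run("https://example.com/news", "example_site"))
    logger.info(f"Demo finished with {len(demo_articles)} articles")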