# Raagsan / date_filter.py
# iamismail's picture
# Initial clean commit for Raagsan Space
# 439e1dd
"""
Date Filtering Module
Handles date parsing and filtering for articles and documents
"""
import logging
from datetime import datetime
from typing import Optional
import re
from dateutil import parser as date_parser
# Module-level logger named after this module. No handlers or levels are
# set on this line; every function below logs through it.
logger = logging.getLogger(__name__)
def parse_article_date(date_str: str) -> Optional[datetime]:
    """
    Parse an article date string into a datetime object.

    Handles the varied date formats commonly found in scraped articles.
    A leading label such as "Posted on" or "Published:" is removed first,
    then two strategies are tried in order:

    1. dateutil fuzzy parsing. Fields missing from the string are filled in
       from today's date at midnight.
    2. A regex search for a numeric YYYY-MM-DD, YYYY/MM/DD, DD-MM-YYYY or
       DD/MM/YYYY date embedded anywhere in the string.

    This function is best-effort and does not raise on unparseable input.

    Args:
        date_str: Date string to parse

    Returns:
        datetime object if parsing successful, None otherwise. The result
        can be timezone-aware when dateutil recognizes an offset in the
        input.
    """
    if not date_str or not date_str.strip():
        return None

    # Try to clean up common prefixes so only the date text remains
    date_str = re.sub(
        r'^(Posted on|Published on|Date:|Posted:|Published:)\s*',
        '',
        date_str.strip(),
        flags=re.IGNORECASE,
    ).strip()

    # Strategy 1: Use dateutil parser (handles most formats)
    try:
        midnight_today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        parsed_date = date_parser.parse(date_str, fuzzy=True, default=midnight_today)
    except (ValueError, TypeError, OverflowError) as e:
        # OverflowError is what dateutil raises for out-of-range numeric
        # tokens; treat it like any other parse failure so the regex
        # fallback below still gets a chance.
        logger.debug(f"⚠️ dateutil parser failed for '{date_str}': {str(e)}")
    except Exception as e:
        # Keep the never-raises contract for anything truly unexpected.
        logger.error(f"❌ Unexpected error parsing date '{date_str}': {str(e)}")
        return None
    else:
        logger.debug(f"βœ… Successfully parsed date '{date_str}' to {parsed_date}")
        return parsed_date

    # Strategy 2: Try common numeric date patterns
    numeric_patterns = [
        r'(\d{4}-\d{2}-\d{2})',  # YYYY-MM-DD
        r'(\d{4}/\d{2}/\d{2})',  # YYYY/MM/DD
        r'(\d{2}-\d{2}-\d{4})',  # DD-MM-YYYY
        r'(\d{2}/\d{2}/\d{4})',  # DD/MM/YYYY
    ]
    for pattern in numeric_patterns:
        match = re.search(pattern, date_str)
        if not match:
            continue

        date_part = match.group(1)
        # Every pattern yields exactly three numeric fields joined by a
        # single kind of separator, so this unpacking cannot fail.
        first, second, third = re.split(r'[-/]', date_part)
        year_first = len(first) == 4
        try:
            if year_first:
                parsed_date = datetime(int(first), int(second), int(third))
            else:
                parsed_date = datetime(int(third), int(second), int(first))
        except ValueError as e:
            # Well-formed digits but not a real calendar date (e.g. month 13)
            logger.debug(f"⚠️ Failed to parse date part '{date_part}': {str(e)}")
            continue

        if year_first:
            logger.debug(f"βœ… Successfully parsed date '{date_str}' to {parsed_date} using ISO pattern")
        else:
            logger.debug(f"βœ… Successfully parsed date '{date_str}' to {parsed_date} using DD-MM-YYYY pattern")
        return parsed_date

    logger.warning(f"⚠️ Could not parse date string: '{date_str}'")
    return None
def standardize_date(date_str: str, default_to_current: bool = False) -> Optional[str]:
    """
    Convert a free-form date string into the canonical YYYY-MM-DD form.

    The input is handed to parse_article_date and the resulting datetime is
    formatted as YYYY-MM-DD, the format used for storage and by the date
    filter.

    Args:
        date_str: Date string in any format, for example "January 15, 2024",
            "15/01/2024" or "Posted on 2024-01-15" (each of which is
            documented to yield '2024-01-15').
        default_to_current: What to do when the input is blank or cannot be
            parsed. True returns today's date; False returns None.

    Returns:
        The date as a YYYY-MM-DD string, or None when the input is blank or
        unparseable and default_to_current is False.
    """
    output_format = "%Y-%m-%d"

    # Blank input: nothing to parse, and nothing worth logging.
    is_blank = not date_str or not date_str.strip()
    if is_blank:
        return datetime.now().strftime(output_format) if default_to_current else None

    article_dt = parse_article_date(date_str)

    # Happy path: emit the canonical form.
    if article_dt is not None:
        standardized = article_dt.strftime(output_format)
        logger.debug(f"βœ… Standardized date '{date_str}' to '{standardized}'")
        return standardized

    # Parsing failed and the caller did not ask for a fallback.
    if not default_to_current:
        logger.debug(f"⚠️ Could not standardize date '{date_str}'")
        return None

    # Parsing failed; fall back to today's date as requested.
    logger.warning(f"⚠️ Could not parse date '{date_str}', using current date")
    return datetime.now().strftime(output_format)
def parse_date_input(date_input: str) -> Optional[datetime]:
    """
    Parse date input from UI (expected to be in YYYY-MM-DD format).

    The strict YYYY-MM-DD format is tried first. Anything else is handed to
    dateutil in non-fuzzy mode as a fallback, so other unambiguous formats
    are still accepted. Parse failures are logged and reported as None
    rather than raised.

    Args:
        date_input: Date string from UI input (YYYY-MM-DD format)

    Returns:
        datetime object if parsing successful, None otherwise
    """
    if not date_input or not date_input.strip():
        return None

    date_input = date_input.strip()

    # Try parsing as YYYY-MM-DD
    try:
        parsed_date = datetime.strptime(date_input, "%Y-%m-%d")
    except ValueError:
        # Not in the strict format; fall through to the dateutil fallback.
        pass
    else:
        logger.debug(f"βœ… Successfully parsed date input '{date_input}' to {parsed_date}")
        return parsed_date

    # Try using dateutil as fallback
    try:
        parsed_date = date_parser.parse(date_input, fuzzy=False)
    except (ValueError, TypeError, OverflowError) as e:
        # OverflowError is raised by dateutil for out-of-range numeric
        # input; report it as a failed parse instead of letting it escape.
        logger.warning(f"⚠️ Could not parse date input '{date_input}': {str(e)}")
        return None

    logger.debug(f"βœ… Successfully parsed date input '{date_input}' to {parsed_date} using dateutil")
    return parsed_date
def is_date_in_range(article_date_str: str, start_date: Optional[datetime], end_date: Optional[datetime], include_missing: bool = True) -> bool:
    """
    Check if article date falls within the selected date range.

    The comparison is made on calendar days: the article date and the start
    bound are taken at start of day, the end bound at end of day, so both
    bounds are inclusive. Timezone information is dropped from all three
    values before comparing, which lets a timezone-aware article date be
    checked against naive range bounds (mixing the two would otherwise
    raise TypeError).

    Args:
        article_date_str: Article date as string
        start_date: Start date of range (inclusive), None if no start date
        end_date: End date of range (inclusive), None if no end date
        include_missing: If True, include articles with missing/invalid dates. If False, exclude them.

    Returns:
        True if article date is in range (or if no date range provided), False otherwise.
        For a missing or unparseable article date, returns include_missing.
    """
    # If no date range provided, include all articles
    if start_date is None and end_date is None:
        return True

    # Try to parse article date
    article_date = parse_article_date(article_date_str)

    # Handle missing/invalid dates
    if article_date is None:
        logger.debug(f"⚠️ Could not parse article date '{article_date_str}', include_missing={include_missing}")
        return include_missing

    # Normalize to start of day; tzinfo=None keeps aware and naive values comparable
    article_normalized = article_date.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)

    if start_date is not None:
        # Normalize to start of day for comparison
        start_normalized = start_date.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
        if article_normalized < start_normalized:
            logger.debug(f"πŸ“… Article date {article_normalized} is before start date {start_normalized}")
            return False

    if end_date is not None:
        # Normalize to end of day for comparison
        end_normalized = end_date.replace(hour=23, minute=59, second=59, microsecond=999999, tzinfo=None)
        if article_normalized > end_normalized:
            logger.debug(f"πŸ“… Article date {article_normalized} is after end date {end_normalized}")
            return False

    logger.debug(f"βœ… Article date {article_date} is within range [{start_date}, {end_date}]")
    return True