"""
Date Filtering Module

Handles date parsing and filtering for articles and documents.
"""
|
|
|
|
|
import logging |
|
|
from datetime import datetime |
|
|
from typing import Optional |
|
|
import re |
|
|
from dateutil import parser as date_parser |
|
|
|
|
|
|
|
|
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
def parse_article_date(date_str: str) -> Optional[datetime]:
    """
    Parse an article date string into a datetime object.

    Handles various date formats commonly found in scraped articles. The
    string is first stripped of a leading label ("Posted on", "Published on",
    "Date:", "Posted:", "Published:", case-insensitive) and then handed to
    dateutil's fuzzy parser. Fields missing from the string are filled in from
    today's date at midnight. If dateutil rejects the string, a small set of
    purely numeric patterns (YYYY-MM-DD, YYYY/MM/DD, DD-MM-YYYY, DD/MM/YYYY)
    is tried as a fallback.

    Args:
        date_str: Date string to parse

    Returns:
        datetime object if parsing successful, None otherwise. The result is
        whatever dateutil produces, so it may be timezone-aware when the
        string carries an offset (see the dateutil documentation).
    """
    if not date_str or not date_str.strip():
        return None

    # Drop a leading "Posted on" / "Published:" style label before parsing.
    date_str = re.sub(
        r'^(Posted on|Published on|Date:|Posted:|Published:)\s*',
        '',
        date_str.strip(),
        flags=re.IGNORECASE,
    ).strip()

    # Numeric fallbacks, tried in order when dateutil cannot parse the string.
    numeric_patterns = (
        r'(\d{4}-\d{2}-\d{2})',
        r'(\d{4}/\d{2}/\d{2})',
        r'(\d{2}-\d{2}-\d{4})',
        r'(\d{2}/\d{2}/\d{4})',
    )

    try:
        midnight_today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        try:
            parsed_date = date_parser.parse(date_str, fuzzy=True, default=midnight_today)
            logger.debug("✅ Successfully parsed date '%s' to %s", date_str, parsed_date)
            return parsed_date
        except (ValueError, TypeError, OverflowError) as e:
            # OverflowError is a documented dateutil failure mode; treat it
            # like any other parse failure so the numeric fallback still runs.
            logger.debug("⚠️ dateutil parser failed for '%s': %s", date_str, e)

        for pattern in numeric_patterns:
            match = re.search(pattern, date_str)
            if not match:
                continue

            date_part = match.group(1)
            try:
                parts = re.split(r'[-/]', date_part)
                first, second, third = (int(piece) for piece in parts)
                if len(parts[0]) == 4:
                    # Year-first layout: YYYY-MM-DD or YYYY/MM/DD.
                    parsed_date = datetime(first, second, third)
                    layout = "ISO"
                else:
                    # Year-last layout is interpreted as day-month-year.
                    parsed_date = datetime(third, second, first)
                    layout = "DD-MM-YYYY"
                logger.debug(
                    "✅ Successfully parsed date '%s' to %s using %s pattern",
                    date_str, parsed_date, layout,
                )
                return parsed_date
            except (ValueError, IndexError) as e:
                # e.g. month 13 or day 32: try the next pattern.
                logger.debug("⚠️ Failed to parse date part '%s': %s", date_part, e)
                continue

        logger.warning("⚠️ Could not parse date string: '%s'", date_str)
        return None

    except Exception as e:
        # Last-resort boundary: callers rely on None rather than an exception.
        logger.error("❌ Unexpected error parsing date '%s': %s", date_str, e)
        return None
|
|
|
|
|
|
|
|
def standardize_date(date_str: str, default_to_current: bool = False) -> Optional[str]:
    """
    Standardize a date string to YYYY-MM-DD format for consistent storage and filtering.

    This function takes a date string in any format, parses it with
    parse_article_date, and returns it in a standardized YYYY-MM-DD format
    that can be used with the date filter.

    Args:
        date_str: Date string in any format (e.g., "January 15, 2024",
            "15/01/2024", "Posted on 2024-01-15")
        default_to_current: If True, return current date when the input is
            empty or parsing fails. If False, return None.

    Returns:
        Standardized date string in YYYY-MM-DD format, or None if parsing
        fails (unless default_to_current=True)

    Examples:
        >>> standardize_date("January 15, 2024")
        '2024-01-15'
        >>> standardize_date("Posted on 2024-01-15")
        '2024-01-15'
        >>> standardize_date("15/01/2024")
        '2024-01-15'
        >>> standardize_date("invalid date") is None
        True

        With default_to_current=True an unparseable input yields today's
        date instead, e.g. standardize_date("invalid date",
        default_to_current=True) returns '2025-01-07' when run on that day.
    """
    if not date_str or not date_str.strip():
        # Empty input: skip parsing (and its warning log) entirely.
        if default_to_current:
            return datetime.now().date().isoformat()
        return None

    parsed_date = parse_article_date(date_str)

    if parsed_date is None:
        if default_to_current:
            logger.warning("⚠️ Could not parse date '%s', using current date", date_str)
            return datetime.now().date().isoformat()
        logger.debug("⚠️ Could not standardize date '%s'", date_str)
        return None

    # isoformat() always zero-pads the year to four digits, which
    # strftime("%Y") does not guarantee on every platform.
    standardized = parsed_date.date().isoformat()
    logger.debug("✅ Standardized date '%s' to '%s'", date_str, standardized)
    return standardized
|
|
|
|
|
|
|
|
def parse_date_input(date_input: str) -> Optional[datetime]:
    """
    Parse date input from UI (expected to be in YYYY-MM-DD format).

    The strict YYYY-MM-DD format is tried first. If that fails, the input is
    handed to dateutil's parser in non-fuzzy mode as a fallback.

    Args:
        date_input: Date string from UI input (YYYY-MM-DD format)

    Returns:
        datetime object if parsing successful, None otherwise. Values that
        match YYYY-MM-DD are naive datetimes at midnight; values handled by
        the dateutil fallback are whatever dateutil returns.
    """
    if not date_input or not date_input.strip():
        return None

    date_input = date_input.strip()

    # Preferred path: the exact format the UI is expected to send.
    try:
        parsed_date = datetime.strptime(date_input, "%Y-%m-%d")
    except ValueError:
        pass
    else:
        logger.debug("✅ Successfully parsed date input '%s' to %s", date_input, parsed_date)
        return parsed_date

    # Fallback: let dateutil try other layouts (strict, no fuzzy matching).
    try:
        parsed_date = date_parser.parse(date_input, fuzzy=False)
    except (ValueError, TypeError, OverflowError) as e:
        # OverflowError is a documented dateutil failure for out-of-range
        # numbers; report it as an unparseable input instead of crashing.
        logger.warning("⚠️ Could not parse date input '%s': %s", date_input, e)
        return None

    logger.debug(
        "✅ Successfully parsed date input '%s' to %s using dateutil",
        date_input, parsed_date,
    )
    return parsed_date
|
|
|
|
|
|
|
|
def is_date_in_range(article_date_str: str, start_date: Optional[datetime], end_date: Optional[datetime], include_missing: bool = True) -> bool:
    """
    Check if article date falls within the selected date range.

    The comparison is made on calendar days: the article date and the start
    date are reduced to midnight, the end date is extended to the last
    microsecond of its day, and any timezone information is dropped so that
    timezone-aware and naive datetimes can be compared without raising
    TypeError.

    Args:
        article_date_str: Article date as string
        start_date: Start date of range (inclusive), None if no start date
        end_date: End date of range (inclusive), None if no end date
        include_missing: If True, include articles with missing/invalid dates.
            If False, exclude them.

    Returns:
        True if article date is in range (or if no date range provided),
        False otherwise. When the article date cannot be parsed, the value of
        include_missing is returned.
    """
    # No bounds at all: everything matches, no parsing needed.
    if start_date is None and end_date is None:
        return True

    article_date = parse_article_date(article_date_str)

    if article_date is None:
        logger.debug(
            "⚠️ Could not parse article date '%s', include_missing=%s",
            article_date_str, include_missing,
        )
        return include_missing

    # Reduce to the calendar day; tzinfo=None keeps aware/naive comparable.
    article_normalized = article_date.replace(
        hour=0, minute=0, second=0, microsecond=0, tzinfo=None
    )

    if start_date is not None:
        start_normalized = start_date.replace(
            hour=0, minute=0, second=0, microsecond=0, tzinfo=None
        )
        if article_normalized < start_normalized:
            logger.debug(
                "📅 Article date %s is before start date %s",
                article_normalized, start_normalized,
            )
            return False

    if end_date is not None:
        # End bound is inclusive for the whole of its day.
        end_normalized = end_date.replace(
            hour=23, minute=59, second=59, microsecond=999999, tzinfo=None
        )
        if article_normalized > end_normalized:
            logger.debug(
                "📅 Article date %s is after end date %s",
                article_normalized, end_normalized,
            )
            return False

    logger.debug(
        "✅ Article date %s is within range [%s, %s]",
        article_date, start_date, end_date,
    )
    return True
|
|
|
|
|
|