course-creator-ai

Sleeping

App Files Community

sizzlebop committed on Jun 11

Commit

8be2f43

verified ·

1 Parent(s): 4d85aba

Upload 34 files

Browse files

Files changed (2) hide show

coursecrafter/tools/__pycache__/web_research.cpython-311.pyc +0 -0
coursecrafter/tools/web_research.py +135 -9

coursecrafter/tools/__pycache__/web_research.cpython-311.pyc CHANGED Viewed

Binary files a/coursecrafter/tools/__pycache__/web_research.cpython-311.pyc and b/coursecrafter/tools/__pycache__/web_research.cpython-311.pyc differ

coursecrafter/tools/web_research.py CHANGED Viewed

@@ -4,14 +4,23 @@ Advanced web research using DuckDuckGo search and Crawl4AI content extraction
 """
 import os
 from typing import List, Dict, Any, Optional
 from duckduckgo_search import DDGS
-from crawl4ai import (
-    AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig,
-    LLMContentFilter, DefaultMarkdownGenerator
-)
 import logging
 logger = logging.getLogger(__name__)
@@ -22,11 +31,16 @@ class WebResearcher:
         self.max_results = max_results
         self.max_crawl_pages = max_crawl_pages
         self.llm_provider = llm_provider or "openai"  # Default fallback
-        self.browser_config = BrowserConfig(
-            headless=True,
-            viewport_width=1280,
-            viewport_height=720
-        )
     async def search_topic(self, topic: str, region: str = "us-en") -> List[Dict[str, Any]]:
         """Search for a topic using DuckDuckGo"""
@@ -58,8 +72,103 @@ class WebResearcher:
             print(f"❌ Search failed: {e}")
             return []
     async def extract_content(self, urls: List[str], topic: str) -> List[Dict[str, Any]]:
         """Extract content from URLs using Crawl4AI with LLM filtering"""
         try:
             print(f"📄 Extracting content from {len(urls)} URLs...")
@@ -192,6 +301,23 @@ class WebResearcher:
         except Exception as e:
             logger.error(f"Content extraction failed: {e}")
             print(f"❌ Content extraction failed: {e}")
             return []
     async def research_topic(self, topic: str) -> Dict[str, Any]:

 """
 import os
+import requests
 from typing import List, Dict, Any, Optional
 from duckduckgo_search import DDGS
+from bs4 import BeautifulSoup
 import logging
+# Try to import Crawl4AI, but have a fallback if it fails
+try:
+    from crawl4ai import (
+        AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig,
+        LLMContentFilter, DefaultMarkdownGenerator
+    )
+    CRAWL4AI_AVAILABLE = True
+except ImportError:
+    CRAWL4AI_AVAILABLE = False
+    print("⚠️ Crawl4AI not available, using fallback web scraping")
 logger = logging.getLogger(__name__)
         self.max_results = max_results
         self.max_crawl_pages = max_crawl_pages
         self.llm_provider = llm_provider or "openai"  # Default fallback
+        if CRAWL4AI_AVAILABLE:
+            self.browser_config = BrowserConfig(
+                headless=True,
+                viewport_width=1280,
+                viewport_height=720
+            )
+        else:
+            self.browser_config = None
+            print("🔄 Using fallback web scraping (requests + BeautifulSoup)")
     async def search_topic(self, topic: str, region: str = "us-en") -> List[Dict[str, Any]]:
         """Search for a topic using DuckDuckGo"""
             print(f"❌ Search failed: {e}")
             return []
+    async def _fallback_extract_content(self, urls: List[str]) -> List[Dict[str, Any]]:
+        """Fallback content extraction using requests and BeautifulSoup"""
+        extracted_content = []
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        for i, url in enumerate(urls[:self.max_crawl_pages]):
+            try:
+                print(f"📖 Scraping {i+1}/{min(len(urls), self.max_crawl_pages)}: {url}")
+                response = requests.get(url, headers=headers, timeout=10)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.content, 'html.parser')
+                # Remove script and style elements
+                for script in soup(["script", "style", "nav", "footer", "header"]):
+                    script.decompose()
+                # Extract title
+                title = ""
+                if soup.title:
+                    title = soup.title.string.strip()
+                # Extract main content
+                content_selectors = [
+                    'main', 'article', '.content', '#content',
+                    '.post-content', '.entry-content', '.article-content'
+                ]
+                content = ""
+                for selector in content_selectors:
+                    content_elem = soup.select_one(selector)
+                    if content_elem:
+                        content = content_elem.get_text(separator='\n', strip=True)
+                        break
+                # If no specific content area found, use body
+                if not content:
+                    content = soup.get_text(separator='\n', strip=True)
+                # Clean up content
+                lines = [line.strip() for line in content.split('\n') if line.strip()]
+                content = '\n'.join(lines)
+                word_count = len(content.split())
+                extracted_content.append({
+                    "url": url,
+                    "title": title,
+                    "content": content,
+                    "word_count": word_count,
+                    "extraction_success": True
+                })
+                print(f"✅ Extracted {word_count} words from {url}")
+            except Exception as e:
+                logger.error(f"Error scraping {url}: {e}")
+                print(f"❌ Error scraping {url}: {e}")
+                extracted_content.append({
+                    "url": url,
+                    "title": "",
+                    "content": "",
+                    "word_count": 0,
+                    "extraction_success": False,
+                    "error": str(e)
+                })
+        successful_extractions = [c for c in extracted_content if c["extraction_success"]]
+        print(f"✅ Successfully extracted content from {len(successful_extractions)}/{len(urls)} URLs")
+        return extracted_content
     async def extract_content(self, urls: List[str], topic: str) -> List[Dict[str, Any]]:
         """Extract content from URLs using Crawl4AI with LLM filtering"""
+        # If Crawl4AI is not available, use fallback immediately
+        if not CRAWL4AI_AVAILABLE:
+            print("🔄 Using fallback content extraction (Crawl4AI not available)")
+            return await self._fallback_extract_content(urls)
+        # Check if Playwright browsers are installed
+        try:
+            from playwright.async_api import async_playwright
+            async with async_playwright() as p:
+                # Try to get browser path - this will fail if browsers aren't installed
+                browser_path = p.chromium.executable_path
+                if not browser_path or not os.path.exists(browser_path):
+                    print("🔄 Playwright browsers not installed, using fallback content extraction")
+                    return await self._fallback_extract_content(urls)
+        except Exception as e:
+            print(f"🔄 Playwright check failed ({e}), using fallback content extraction")
+            return await self._fallback_extract_content(urls)
         try:
             print(f"📄 Extracting content from {len(urls)} URLs...")
         except Exception as e:
             logger.error(f"Content extraction failed: {e}")
             print(f"❌ Content extraction failed: {e}")
+            # If Crawl4AI fails (likely due to Playwright), try fallback
+            error_str = str(e)
+            playwright_errors = [
+                "Executable doesn't exist",
+                "BrowserType.launch",
+                "playwright install",
+                "Playwright was just installed",
+                "download new browsers",
+                "chromium-",
+                "chrome-linux/chrome"
+            ]
+            if any(error in error_str for error in playwright_errors):
+                print("🔄 Playwright browser binaries not available, falling back to simple web scraping")
+                return await self._fallback_extract_content(urls)
             return []
     async def research_topic(self, topic: str) -> Dict[str, Any]: