Move files, add domain exclusion

2025-04-12 12:53:48 -05:00
parent cac52687c8
commit 1cf86dd48b
5 changed files with 59 additions and 8 deletions

34  src/domain_exclusions.py  Normal file

@@ -0,0 +1,34 @@
import yaml
from fnmatch import fnmatch


class DomainExclusions:
    def __init__(self, config_path="history_config.yaml"):
        self.excluded_domains = []
        self.load_config(config_path)

    def load_config(self, config_path):
        """Load excluded domains from the YAML configuration file."""
        try:
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
                # Get the excluded_domains list from config, defaulting to empty list if not found
                self.excluded_domains = config.get('excluded_domains', [])
        except FileNotFoundError:
            print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
        except yaml.YAMLError as e:
            print(f"Error parsing YAML configuration: {e}")
            self.excluded_domains = []

    def is_excluded(self, domain):
        """
        Check if a domain matches any of the excluded domain patterns.
        Supports wildcards (*, ?) in the excluded domain patterns.

        Args:
            domain (str): The domain to check

        Returns:
            bool: True if the domain should be excluded, False otherwise
        """
        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
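For reference, a minimal usage sketch of the new class (not part of the commit): the excluded_domains key and the wildcard matching mirror src/domain_exclusions.py above, while the config contents and the example domains are hypothetical.

    from urllib.parse import urlparse
    from domain_exclusions import DomainExclusions

    # history_config.yaml might contain, for example:
    #   excluded_domains:
    #     - "*.google.com"
    #     - "localhost"
    exclusions = DomainExclusions("history_config.yaml")

    url = "https://mail.google.com/mail/u/0/"
    domain = urlparse(url).netloc           # "mail.google.com"
    print(exclusions.is_excluded(domain))   # True if "*.google.com" is listed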


@@ -4,6 +4,8 @@ from logger import Logger
 import os
 from database import Database
 from crawl4ai import AsyncWebCrawler
+from domain_exclusions import DomainExclusions
+from urllib.parse import urlparse
 
 # Create logs directory if it doesn't exist
 os.makedirs('logs', exist_ok=True)
@@ -12,6 +14,7 @@ app = FastAPI()
 logger = Logger()
 db = Database()
+domain_exclusions = DomainExclusions()  # Initialize with default config path
 
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
@@ -23,32 +26,46 @@ async def websocket_endpoint(websocket: WebSocket):
     try:
         while True:
             data = await websocket.receive_json()
+            url = data["url"]
+            domain = urlparse(url).netloc
+
+            # Check if domain is excluded
+            if domain_exclusions.is_excluded(domain):
+                logger.info(f"Skipping excluded domain: {domain}")
+                await websocket.send_json({
+                    "status": "skipped",
+                    "data": {
+                        "url": url,
+                        "title": "Excluded Domain",
+                        "timestamp": data["timestamp"]
+                    }
+                })
+                continue
 
             # Crawl the URL to get title and content
             try:
-                result = await crawler.arun(url=data["url"])
+                result = await crawler.arun(url=url)
                 # Get the first result from the container and access metadata
                 crawl_result = result[0]
-                title = crawl_result.metadata.get('title') or data["url"].split("/")[-1]
+                title = crawl_result.metadata.get('title') or url.split("/")[-1]
                 content = crawl_result.markdown
-                logger.info(f"Crawling result: {result}")
             except Exception as crawl_error:
-                logger.error(f"Crawling error for {data['url']}: {str(crawl_error)}")
-                title = data["url"].split("/")[-1]
+                logger.error(f"Crawling error for {url}: {str(crawl_error)}")
+                title = url.split("/")[-1]
                 content = str(data)
 
             # Store received data with crawled information
             db.add_history(
-                url=data["url"],
+                url=url,
                 title=title,
                 content=content
             )
-            logger.info(f"Processed URL: {data['url']} - {title}")
+            logger.info(f"Processed URL: {url} - {title}")
 
             await websocket.send_json({
                 "status": "received",
                 "data": {
-                    "url": data["url"],
+                    "url": url,
                     "title": title,
                     "timestamp": data["timestamp"]
                 }
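A minimal client-side sketch of the exchange above (not part of the commit): the /ws path and the message shapes match the handler, while the host, port, and example payload are assumptions.

    import asyncio
    import json
    import websockets  # assumes the server runs locally on port 8000

    async def main():
        async with websockets.connect("ws://localhost:8000/ws") as ws:
            await ws.send(json.dumps({
                "url": "https://example.com/article",
                "timestamp": "2025-04-12T12:53:48-05:00",
            }))
            reply = json.loads(await ws.recv())
            # "skipped" for excluded domains, "received" otherwise
            print(reply["status"], reply["data"]["title"])

    asyncio.run(main())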