mirror of
https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 02:19:37 +00:00
Move files, add domain exclusion
This commit is contained in:
34
src/domain_exclusions.py
Normal file
34
src/domain_exclusions.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import yaml
|
||||
from fnmatch import fnmatch
|
||||
|
||||
class DomainExclusions:
    """Domain-exclusion filter backed by a YAML configuration file.

    Loads a list of glob-style domain patterns (e.g. ``*.example.com``)
    and answers whether a given domain should be excluded from history
    capture. Matching is case-insensitive.
    """

    def __init__(self, config_path="history_config.yaml"):
        # Start with no exclusions so a failed config load degrades to
        # "nothing excluded" rather than an undefined attribute.
        self.excluded_domains = []
        self.load_config(config_path)

    def load_config(self, config_path):
        """Load excluded domains from the YAML configuration file.

        Args:
            config_path (str): Path to the YAML config file.

        Never raises: a missing, empty, or malformed file leaves the
        exclusion list empty and prints a warning instead of crashing.
        """
        try:
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
            # safe_load returns None for an empty file; it can also yield a
            # non-mapping top-level document. Either would crash .get(), so
            # normalize to an empty dict first.
            if not isinstance(config, dict):
                config = {}
            # `excluded_domains: null` in YAML yields None — coerce any
            # falsy value to an empty list so is_excluded() stays safe.
            self.excluded_domains = config.get('excluded_domains') or []
        except FileNotFoundError:
            print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
        except yaml.YAMLError as e:
            print(f"Error parsing YAML configuration: {e}")
            self.excluded_domains = []

    def is_excluded(self, domain):
        """
        Check if a domain matches any of the excluded domain patterns.
        Supports wildcards (*, ?) in the excluded domain patterns.

        Args:
            domain (str): The domain to check

        Returns:
            bool: True if the domain should be excluded, False otherwise
        """
        # Lowercase both sides so matching is case-insensitive regardless
        # of how the pattern was written in the config file.
        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
|
||||
@@ -4,6 +4,8 @@ from logger import Logger
|
||||
import os
|
||||
from database import Database
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from domain_exclusions import DomainExclusions
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Create logs directory if it doesn't exist
|
||||
os.makedirs('logs', exist_ok=True)
|
||||
@@ -12,6 +14,7 @@ app = FastAPI()
|
||||
logger = Logger()
|
||||
|
||||
db = Database()
|
||||
domain_exclusions = DomainExclusions() # Initialize with default config path
|
||||
|
||||
@app.websocket("/ws")
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
@@ -23,32 +26,46 @@ async def websocket_endpoint(websocket: WebSocket):
|
||||
try:
|
||||
while True:
|
||||
data = await websocket.receive_json()
|
||||
url = data["url"]
|
||||
domain = urlparse(url).netloc
|
||||
|
||||
# Check if domain is excluded
|
||||
if domain_exclusions.is_excluded(domain):
|
||||
logger.info(f"Skipping excluded domain: {domain}")
|
||||
await websocket.send_json({
|
||||
"status": "skipped",
|
||||
"data": {
|
||||
"url": url,
|
||||
"title": "Excluded Domain",
|
||||
"timestamp": data["timestamp"]
|
||||
}
|
||||
})
|
||||
continue
|
||||
|
||||
# Crawl the URL to get title and content
|
||||
try:
|
||||
result = await crawler.arun(url=data["url"])
|
||||
result = await crawler.arun(url=url)
|
||||
# Get the first result from the container and access metadata
|
||||
crawl_result = result[0]
|
||||
title = crawl_result.metadata.get('title') or data["url"].split("/")[-1]
|
||||
title = crawl_result.metadata.get('title') or url.split("/")[-1]
|
||||
content = crawl_result.markdown
|
||||
logger.info(f"Crawling result: {result}")
|
||||
except Exception as crawl_error:
|
||||
logger.error(f"Crawling error for {data['url']}: {str(crawl_error)}")
|
||||
title = data["url"].split("/")[-1]
|
||||
logger.error(f"Crawling error for {url}: {str(crawl_error)}")
|
||||
title = url.split("/")[-1]
|
||||
content = str(data)
|
||||
|
||||
# Store received data with crawled information
|
||||
db.add_history(
|
||||
url=data["url"],
|
||||
url=url,
|
||||
title=title,
|
||||
content=content
|
||||
)
|
||||
|
||||
logger.info(f"Processed URL: {data['url']} - {title}")
|
||||
logger.info(f"Processed URL: {url} - {title}")
|
||||
await websocket.send_json({
|
||||
"status": "received",
|
||||
"data": {
|
||||
"url": data["url"],
|
||||
"url": url,
|
||||
"title": title,
|
||||
"timestamp": data["timestamp"]
|
||||
}
|
||||
Reference in New Issue
Block a user