From 1cf86dd48b63760726fe0f9cc59252ab8159d94c Mon Sep 17 00:00:00 2001
From: Zetaphor
Date: Sat, 12 Apr 2025 12:53:48 -0500
Subject: [PATCH] Move files, add domain exclusion

---
 ...reader_config.yaml => history_config.yaml} |  0
 database.py => src/database.py                |  0
 src/domain_exclusions.py                      | 34 +++++++++++++++++++
 logger.py => src/logger.py                    |  0
 main.py => src/main.py                        | 33 +++++++++++++-----
 5 files changed, 59 insertions(+), 8 deletions(-)
 rename config/{reader_config.yaml => history_config.yaml} (100%)
 rename database.py => src/database.py (100%)
 create mode 100644 src/domain_exclusions.py
 rename logger.py => src/logger.py (100%)
 rename main.py => src/main.py (61%)

diff --git a/config/reader_config.yaml b/config/history_config.yaml
similarity index 100%
rename from config/reader_config.yaml
rename to config/history_config.yaml
diff --git a/database.py b/src/database.py
similarity index 100%
rename from database.py
rename to src/database.py
diff --git a/src/domain_exclusions.py b/src/domain_exclusions.py
new file mode 100644
index 0000000..062a4b6
--- /dev/null
+++ b/src/domain_exclusions.py
@@ -0,0 +1,34 @@
+import yaml
+from fnmatch import fnmatch
+
+class DomainExclusions:
+    def __init__(self, config_path="history_config.yaml"):
+        self.excluded_domains = []
+        self.load_config(config_path)
+
+    def load_config(self, config_path):
+        """Load excluded domains from the YAML configuration file."""
+        try:
+            with open(config_path, 'r') as f:
+                config = yaml.safe_load(f)
+
+            # Get the excluded_domains list from config, defaulting to empty list if not found
+            self.excluded_domains = config.get('excluded_domains', [])
+        except FileNotFoundError:
+            print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
+        except yaml.YAMLError as e:
+            print(f"Error parsing YAML configuration: {e}")
+            self.excluded_domains = []
+
+    def is_excluded(self, domain):
+        """
+        Check if a domain matches any of the excluded domain patterns.
+        Supports wildcards (*, ?) in the excluded domain patterns.
+
+        Args:
+            domain (str): The domain to check
+
+        Returns:
+            bool: True if the domain should be excluded, False otherwise
+        """
+        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
\ No newline at end of file
diff --git a/logger.py b/src/logger.py
similarity index 100%
rename from logger.py
rename to src/logger.py
diff --git a/main.py b/src/main.py
similarity index 61%
rename from main.py
rename to src/main.py
index 3e947a1..34864a4 100644
--- a/main.py
+++ b/src/main.py
@@ -4,6 +4,8 @@ from logger import Logger
 import os
 from database import Database
 from crawl4ai import AsyncWebCrawler
+from domain_exclusions import DomainExclusions
+from urllib.parse import urlparse
 
 # Create logs directory if it doesn't exist
 os.makedirs('logs', exist_ok=True)
@@ -12,6 +14,7 @@ app = FastAPI()
 
 logger = Logger()
 db = Database()
+domain_exclusions = DomainExclusions()  # Initialize with default config path
 
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
@@ -23,32 +26,46 @@ async def websocket_endpoint(websocket: WebSocket):
         try:
             while True:
                 data = await websocket.receive_json()
+                url = data["url"]
+                domain = urlparse(url).netloc
+
+                # Check if domain is excluded
+                if domain_exclusions.is_excluded(domain):
+                    logger.info(f"Skipping excluded domain: {domain}")
+                    await websocket.send_json({
+                        "status": "skipped",
+                        "data": {
+                            "url": url,
+                            "title": "Excluded Domain",
+                            "timestamp": data["timestamp"]
+                        }
+                    })
+                    continue
 
                 # Crawl the URL to get title and content
                 try:
-                    result = await crawler.arun(url=data["url"])
+                    result = await crawler.arun(url=url)
                     # Get the first result from the container and access metadata
                     crawl_result = result[0]
-                    title = crawl_result.metadata.get('title') or data["url"].split("/")[-1]
+                    title = crawl_result.metadata.get('title') or url.split("/")[-1]
                     content = crawl_result.markdown
-                    logger.info(f"Crawling result: {result}")
                 except Exception as crawl_error:
-                    logger.error(f"Crawling error for {data['url']}: {str(crawl_error)}")
-                    title = data["url"].split("/")[-1]
+                    logger.error(f"Crawling error for {url}: {str(crawl_error)}")
+                    title = url.split("/")[-1]
                     content = str(data)
 
                 # Store received data with crawled information
                 db.add_history(
-                    url=data["url"],
+                    url=url,
                     title=title,
                     content=content
                 )
 
-                logger.info(f"Processed URL: {data['url']} - {title}")
+                logger.info(f"Processed URL: {url} - {title}")
                 await websocket.send_json({
                     "status": "received",
                     "data": {
-                        "url": url,
+                        "url": url,
                         "title": title,
                         "timestamp": data["timestamp"]
                     }
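Note, not part of the patch: the sketch below illustrates the fnmatch-based wildcard
matching that DomainExclusions.is_excluded performs, assuming history_config.yaml is
given an excluded_domains list. The pattern values shown are hypothetical examples,
not values taken from the repository's config.

# Hypothetical excluded_domains values, as they might appear in history_config.yaml:
#   excluded_domains:
#     - "localhost"
#     - "*.google.com"
from fnmatch import fnmatch

excluded_domains = ["localhost", "*.google.com"]  # assumed example config values

def is_excluded(domain):
    # Mirrors DomainExclusions.is_excluded: lowercase both sides, then wildcard-match
    return any(fnmatch(domain.lower(), pattern.lower()) for pattern in excluded_domains)

print(is_excluded("docs.google.com"))  # True: matches "*.google.com"
print(is_excluded("google.com"))       # False: "*.google.com" requires a subdomain before the dot
print(is_excluded("LOCALHOST"))        # True: comparison is case-insensitive

Because main.py passes urlparse(url).netloc to is_excluded, a pattern such as
"*.google.com" does not cover the bare "google.com"; list both forms if both should be skipped.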