From 1cf86dd48b63760726fe0f9cc59252ab8159d94c Mon Sep 17 00:00:00 2001
From: Zetaphor
Date: Sat, 12 Apr 2025 12:53:48 -0500
Subject: [PATCH] Move files, add domain exclusion

---
 ...reader_config.yaml => history_config.yaml} |  0
 database.py => src/database.py                |  0
 src/domain_exclusions.py                      | 34 +++++++++++++++++++
 logger.py => src/logger.py                    |  0
 main.py => src/main.py                        | 33 +++++++++++++-----
 5 files changed, 59 insertions(+), 8 deletions(-)
 rename config/{reader_config.yaml => history_config.yaml} (100%)
 rename database.py => src/database.py (100%)
 create mode 100644 src/domain_exclusions.py
 rename logger.py => src/logger.py (100%)
 rename main.py => src/main.py (61%)

diff --git a/config/reader_config.yaml b/config/history_config.yaml
similarity index 100%
rename from config/reader_config.yaml
rename to config/history_config.yaml
diff --git a/database.py b/src/database.py
similarity index 100%
rename from database.py
rename to src/database.py
diff --git a/src/domain_exclusions.py b/src/domain_exclusions.py
new file mode 100644
index 0000000..062a4b6
--- /dev/null
+++ b/src/domain_exclusions.py
@@ -0,0 +1,34 @@
+import yaml
+from fnmatch import fnmatch
+
+class DomainExclusions:
+    def __init__(self, config_path="history_config.yaml"):
+        self.excluded_domains = []
+        self.load_config(config_path)
+
+    def load_config(self, config_path):
+        """Load excluded domains from the YAML configuration file."""
+        try:
+            with open(config_path, 'r') as f:
+                config = yaml.safe_load(f)
+
+            # Get the excluded_domains list from config, defaulting to empty list if not found
+            self.excluded_domains = config.get('excluded_domains', [])
+        except FileNotFoundError:
+            print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
+        except yaml.YAMLError as e:
+            print(f"Error parsing YAML configuration: {e}")
+            self.excluded_domains = []
+
+    def is_excluded(self, domain):
+        """
+        Check if a domain matches any of the excluded domain patterns.
+        Supports wildcards (*, ?) in the excluded domain patterns.
+
+        Args:
+            domain (str): The domain to check
+
+        Returns:
+            bool: True if the domain should be excluded, False otherwise
+        """
+        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
\ No newline at end of file
diff --git a/logger.py b/src/logger.py
similarity index 100%
rename from logger.py
rename to src/logger.py
diff --git a/main.py b/src/main.py
similarity index 61%
rename from main.py
rename to src/main.py
index 3e947a1..34864a4 100644
--- a/main.py
+++ b/src/main.py
@@ -4,6 +4,8 @@ from logger import Logger
 import os
 from database import Database
 from crawl4ai import AsyncWebCrawler
+from domain_exclusions import DomainExclusions
+from urllib.parse import urlparse
 
 # Create logs directory if it doesn't exist
 os.makedirs('logs', exist_ok=True)
@@ -12,6 +14,7 @@ app = FastAPI()
 
 logger = Logger()
 db = Database()
+domain_exclusions = DomainExclusions()  # Initialize with default config path
 
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
@@ -23,32 +26,46 @@ async def websocket_endpoint(websocket: WebSocket):
         try:
             while True:
                 data = await websocket.receive_json()
+                url = data["url"]
+                domain = urlparse(url).netloc
+
+                # Check if domain is excluded
+                if domain_exclusions.is_excluded(domain):
+                    logger.info(f"Skipping excluded domain: {domain}")
+                    await websocket.send_json({
+                        "status": "skipped",
+                        "data": {
+                            "url": url,
+                            "title": "Excluded Domain",
+                            "timestamp": data["timestamp"]
+                        }
+                    })
+                    continue
 
                 # Crawl the URL to get title and content
                 try:
-                    result = await crawler.arun(url=data["url"])
+                    result = await crawler.arun(url=url)
                     # Get the first result from the container and access metadata
                     crawl_result = result[0]
-                    title = crawl_result.metadata.get('title') or data["url"].split("/")[-1]
+                    title = crawl_result.metadata.get('title') or url.split("/")[-1]
                     content = crawl_result.markdown
-                    logger.info(f"Crawling result: {result}")
                 except Exception as crawl_error:
-                    logger.error(f"Crawling error for {data['url']}: {str(crawl_error)}")
-                    title = data["url"].split("/")[-1]
+                    logger.error(f"Crawling error for {url}: {str(crawl_error)}")
+                    title = url.split("/")[-1]
                     content = str(data)
 
                 # Store received data with crawled information
                 db.add_history(
-                    url=data["url"],
+                    url=url,
                     title=title,
                     content=content
                 )
 
-                logger.info(f"Processed URL: {data['url']} - {title}")
+                logger.info(f"Processed URL: {url} - {title}")
                 await websocket.send_json({
                     "status": "received",
                     "data": {
-                        "url": url,
+                        "url": url,
                         "title": title,
                         "timestamp": data["timestamp"]
                     }
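Note, not part of the patch: the sketch below illustrates the fnmatch-based wildcard
matching that DomainExclusions.is_excluded performs, assuming history_config.yaml is
given an excluded_domains list. The pattern values shown are hypothetical examples,
not values taken from the repository's config.

# Hypothetical excluded_domains values, as they might appear in history_config.yaml:
#   excluded_domains:
#     - "localhost"
#     - "*.google.com"
from fnmatch import fnmatch

excluded_domains = ["localhost", "*.google.com"]  # assumed example config values

def is_excluded(domain):
    # Mirrors DomainExclusions.is_excluded: lowercase both sides, then wildcard-match
    return any(fnmatch(domain.lower(), pattern.lower()) for pattern in excluded_domains)

print(is_excluded("docs.google.com"))  # True: matches "*.google.com"
print(is_excluded("google.com"))       # False: "*.google.com" requires a subdomain before the dot
print(is_excluded("LOCALHOST"))        # True: comparison is case-insensitive

Because main.py passes urlparse(url).netloc to is_excluded, a pattern such as
"*.google.com" does not cover the bare "google.com"; list both forms if both should be skipped.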