Move files, add domain exclusion

2025-04-12 12:53:48 -05:00
parent cac52687c8
commit 1cf86dd48b
5 changed files with 59 additions and 8 deletions

34  src/domain_exclusions.py  Normal file

@@ -0,0 +1,34 @@
import yaml
from fnmatch import fnmatch


class DomainExclusions:
    def __init__(self, config_path="history_config.yaml"):
        self.excluded_domains = []
        self.load_config(config_path)

    def load_config(self, config_path):
        """Load excluded domains from the YAML configuration file."""
        try:
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
                # Get the excluded_domains list from config, defaulting to empty list if not found
                self.excluded_domains = config.get('excluded_domains', [])
        except FileNotFoundError:
            print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
        except yaml.YAMLError as e:
            print(f"Error parsing YAML configuration: {e}")
            self.excluded_domains = []

    def is_excluded(self, domain):
        """
        Check if a domain matches any of the excluded domain patterns.
        Supports wildcards (*, ?) in the excluded domain patterns.

        Args:
            domain (str): The domain to check

        Returns:
            bool: True if the domain should be excluded, False otherwise
        """
        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
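For reference, a minimal usage sketch of the new class (not part of the commit): the excluded_domains key and the wildcard matching mirror src/domain_exclusions.py above, while the config contents and the example domains are hypothetical.

    from urllib.parse import urlparse
    from domain_exclusions import DomainExclusions

    # history_config.yaml might contain, for example:
    #   excluded_domains:
    #     - "*.google.com"
    #     - "localhost"
    exclusions = DomainExclusions("history_config.yaml")

    url = "https://mail.google.com/mail/u/0/"
    domain = urlparse(url).netloc           # "mail.google.com"
    print(exclusions.is_excluded(domain))   # True if "*.google.com" is listed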


@@ -4,6 +4,8 @@ from logger import Logger
 import os
 from database import Database
 from crawl4ai import AsyncWebCrawler
+from domain_exclusions import DomainExclusions
+from urllib.parse import urlparse
 
 # Create logs directory if it doesn't exist
 os.makedirs('logs', exist_ok=True)
@@ -12,6 +14,7 @@ app = FastAPI()
 logger = Logger()
 db = Database()
+domain_exclusions = DomainExclusions()  # Initialize with default config path
 
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
@@ -23,32 +26,46 @@ async def websocket_endpoint(websocket: WebSocket):
     try:
         while True:
             data = await websocket.receive_json()
+            url = data["url"]
+            domain = urlparse(url).netloc
+
+            # Check if domain is excluded
+            if domain_exclusions.is_excluded(domain):
+                logger.info(f"Skipping excluded domain: {domain}")
+                await websocket.send_json({
+                    "status": "skipped",
+                    "data": {
+                        "url": url,
+                        "title": "Excluded Domain",
+                        "timestamp": data["timestamp"]
+                    }
+                })
+                continue
 
             # Crawl the URL to get title and content
             try:
-                result = await crawler.arun(url=data["url"])
+                result = await crawler.arun(url=url)
                 # Get the first result from the container and access metadata
                 crawl_result = result[0]
-                title = crawl_result.metadata.get('title') or data["url"].split("/")[-1]
+                title = crawl_result.metadata.get('title') or url.split("/")[-1]
                 content = crawl_result.markdown
-                logger.info(f"Crawling result: {result}")
             except Exception as crawl_error:
-                logger.error(f"Crawling error for {data['url']}: {str(crawl_error)}")
-                title = data["url"].split("/")[-1]
+                logger.error(f"Crawling error for {url}: {str(crawl_error)}")
+                title = url.split("/")[-1]
                 content = str(data)
 
             # Store received data with crawled information
             db.add_history(
-                url=data["url"],
+                url=url,
                 title=title,
                 content=content
             )
-            logger.info(f"Processed URL: {data['url']} - {title}")
+            logger.info(f"Processed URL: {url} - {title}")
 
             await websocket.send_json({
                 "status": "received",
                 "data": {
-                    "url": data["url"],
+                    "url": url,
                     "title": title,
                     "timestamp": data["timestamp"]
                 }
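A minimal client-side sketch of the exchange above (not part of the commit): the /ws path and the message shapes match the handler, while the host, port, and example payload are assumptions.

    import asyncio
    import json
    import websockets  # assumes the server runs locally on port 8000

    async def main():
        async with websockets.connect("ws://localhost:8000/ws") as ws:
            await ws.send(json.dumps({
                "url": "https://example.com/article",
                "timestamp": "2025-04-12T12:53:48-05:00",
            }))
            reply = json.loads(await ws.recv())
            # "skipped" for excluded domains, "received" otherwise
            print(reply["status"], reply["data"]["title"])

    asyncio.run(main())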