mirror of
https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 02:19:37 +00:00
Move files, add domain exclusion
This commit is contained in:
34
src/domain_exclusions.py
Normal file
34
src/domain_exclusions.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import yaml
|
||||
from fnmatch import fnmatch
|
||||
|
||||
class DomainExclusions:
    """Domain-exclusion filter backed by a YAML configuration file.

    Loads a list of glob-style domain patterns (e.g. ``*.example.com``)
    and answers whether a given domain should be excluded from history
    capture. Matching is case-insensitive.
    """

    def __init__(self, config_path="history_config.yaml"):
        # Start with no exclusions so a failed config load degrades to
        # "nothing excluded" rather than an undefined attribute.
        self.excluded_domains = []
        self.load_config(config_path)

    def load_config(self, config_path):
        """Load excluded domains from the YAML configuration file.

        Args:
            config_path (str): Path to the YAML config file.

        Never raises: a missing, empty, or malformed file leaves the
        exclusion list empty and prints a warning instead of crashing.
        """
        try:
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
            # safe_load returns None for an empty file; it can also yield a
            # non-mapping top-level document. Either would crash .get(), so
            # normalize to an empty dict first.
            if not isinstance(config, dict):
                config = {}
            # `excluded_domains: null` in YAML yields None — coerce any
            # falsy value to an empty list so is_excluded() stays safe.
            self.excluded_domains = config.get('excluded_domains') or []
        except FileNotFoundError:
            print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
        except yaml.YAMLError as e:
            print(f"Error parsing YAML configuration: {e}")
            self.excluded_domains = []

    def is_excluded(self, domain):
        """
        Check if a domain matches any of the excluded domain patterns.
        Supports wildcards (*, ?) in the excluded domain patterns.

        Args:
            domain (str): The domain to check

        Returns:
            bool: True if the domain should be excluded, False otherwise
        """
        # Lowercase both sides so matching is case-insensitive regardless
        # of how the pattern was written in the config file.
        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
|
||||
@@ -4,6 +4,8 @@ from logger import Logger
|
||||
import os
|
||||
from database import Database
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from domain_exclusions import DomainExclusions
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Create logs directory if it doesn't exist
|
||||
os.makedirs('logs', exist_ok=True)
|
||||
@@ -12,6 +14,7 @@ app = FastAPI()
|
||||
logger = Logger()
|
||||
|
||||
db = Database()
|
||||
domain_exclusions = DomainExclusions() # Initialize with default config path
|
||||
|
||||
@app.websocket("/ws")
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
@@ -23,32 +26,46 @@ async def websocket_endpoint(websocket: WebSocket):
|
||||
try:
|
||||
while True:
|
||||
data = await websocket.receive_json()
|
||||
url = data["url"]
|
||||
domain = urlparse(url).netloc
|
||||
|
||||
# Check if domain is excluded
|
||||
if domain_exclusions.is_excluded(domain):
|
||||
logger.info(f"Skipping excluded domain: {domain}")
|
||||
await websocket.send_json({
|
||||
"status": "skipped",
|
||||
"data": {
|
||||
"url": url,
|
||||
"title": "Excluded Domain",
|
||||
"timestamp": data["timestamp"]
|
||||
}
|
||||
})
|
||||
continue
|
||||
|
||||
# Crawl the URL to get title and content
|
||||
try:
|
||||
result = await crawler.arun(url=data["url"])
|
||||
result = await crawler.arun(url=url)
|
||||
# Get the first result from the container and access metadata
|
||||
crawl_result = result[0]
|
||||
title = crawl_result.metadata.get('title') or data["url"].split("/")[-1]
|
||||
title = crawl_result.metadata.get('title') or url.split("/")[-1]
|
||||
content = crawl_result.markdown
|
||||
logger.info(f"Crawling result: {result}")
|
||||
except Exception as crawl_error:
|
||||
logger.error(f"Crawling error for {data['url']}: {str(crawl_error)}")
|
||||
title = data["url"].split("/")[-1]
|
||||
logger.error(f"Crawling error for {url}: {str(crawl_error)}")
|
||||
title = url.split("/")[-1]
|
||||
content = str(data)
|
||||
|
||||
# Store received data with crawled information
|
||||
db.add_history(
|
||||
url=data["url"],
|
||||
url=url,
|
||||
title=title,
|
||||
content=content
|
||||
)
|
||||
|
||||
logger.info(f"Processed URL: {data['url']} - {title}")
|
||||
logger.info(f"Processed URL: {url} - {title}")
|
||||
await websocket.send_json({
|
||||
"status": "received",
|
||||
"data": {
|
||||
"url": data["url"],
|
||||
"url": url,
|
||||
"title": title,
|
||||
"timestamp": data["timestamp"]
|
||||
}
|
||||
Reference in New Issue
Block a user