mirror of https://github.com/Zetaphor/browser-recall.git, synced 2025-12-06 02:19:37 +00:00

Move files, add domain exclusion
src/domain_exclusions.py: 34 lines added (new file)
@@ -0,0 +1,34 @@
+import yaml
+from fnmatch import fnmatch
+
+class DomainExclusions:
+    def __init__(self, config_path="history_config.yaml"):
+        self.excluded_domains = []
+        self.load_config(config_path)
+
+    def load_config(self, config_path):
+        """Load excluded domains from the YAML configuration file."""
+        try:
+            with open(config_path, 'r') as f:
+                config = yaml.safe_load(f)
+
+            # Get the excluded_domains list from config, defaulting to empty list if not found
+            self.excluded_domains = config.get('excluded_domains', [])
+        except FileNotFoundError:
+            print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
+        except yaml.YAMLError as e:
+            print(f"Error parsing YAML configuration: {e}")
+            self.excluded_domains = []
+
+    def is_excluded(self, domain):
+        """
+        Check if a domain matches any of the excluded domain patterns.
+        Supports wildcards (*, ?) in the excluded domain patterns.
+
+        Args:
+            domain (str): The domain to check
+
+        Returns:
+            bool: True if the domain should be excluded, False otherwise
+        """
+        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
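DomainExclusions reads an excluded_domains list from the YAML config and compares hostnames case-insensitively using shell-style wildcards via fnmatch. A minimal usage sketch, assuming PyYAML is installed and domain_exclusions.py is importable; the config contents and domains below are illustrative, not taken from the repo:

# Write an illustrative config, then exercise the matcher.
import yaml
from domain_exclusions import DomainExclusions

with open("history_config.yaml", "w") as f:
    yaml.safe_dump({"excluded_domains": ["*.internal.example", "localhost", "bank.example"]}, f)

exclusions = DomainExclusions("history_config.yaml")
print(exclusions.is_excluded("vault.internal.example"))  # True  -> matches *.internal.example
print(exclusions.is_excluded("LOCALHOST"))               # True  -> comparison is lowercased
print(exclusions.is_excluded("internal.example"))        # False -> "*." requires a subdomain

Note that with fnmatch a pattern like *.internal.example does not match the bare internal.example, so the apex domain needs its own entry if it should also be skipped.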
@@ -4,6 +4,8 @@ from logger import Logger
 import os
 from database import Database
 from crawl4ai import AsyncWebCrawler
+from domain_exclusions import DomainExclusions
+from urllib.parse import urlparse

 # Create logs directory if it doesn't exist
 os.makedirs('logs', exist_ok=True)
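The handler below derives the hostname from the incoming URL with urlparse before consulting the exclusion list. A quick illustration of that netloc extraction (URLs invented for the example):

from urllib.parse import urlparse

print(urlparse("https://news.example.com/story?id=1").netloc)  # news.example.com
print(urlparse("http://localhost:8000/docs").netloc)           # localhost:8000 (port is kept)

Because the port is part of netloc, an entry like localhost only matches port-less URLs; a wildcard such as localhost* would cover both.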
@@ -12,6 +14,7 @@ app = FastAPI()
 logger = Logger()

 db = Database()
+domain_exclusions = DomainExclusions() # Initialize with default config path

 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
@@ -23,32 +26,46 @@ async def websocket_endpoint(websocket: WebSocket):
     try:
         while True:
             data = await websocket.receive_json()
+            url = data["url"]
+            domain = urlparse(url).netloc
+
+            # Check if domain is excluded
+            if domain_exclusions.is_excluded(domain):
+                logger.info(f"Skipping excluded domain: {domain}")
+                await websocket.send_json({
+                    "status": "skipped",
+                    "data": {
+                        "url": url,
+                        "title": "Excluded Domain",
+                        "timestamp": data["timestamp"]
+                    }
+                })
+                continue

             # Crawl the URL to get title and content
             try:
-                result = await crawler.arun(url=data["url"])
+                result = await crawler.arun(url=url)
                 # Get the first result from the container and access metadata
                 crawl_result = result[0]
-                title = crawl_result.metadata.get('title') or data["url"].split("/")[-1]
+                title = crawl_result.metadata.get('title') or url.split("/")[-1]
                 content = crawl_result.markdown
-                logger.info(f"Crawling result: {result}")
             except Exception as crawl_error:
-                logger.error(f"Crawling error for {data['url']}: {str(crawl_error)}")
-                title = data["url"].split("/")[-1]
+                logger.error(f"Crawling error for {url}: {str(crawl_error)}")
+                title = url.split("/")[-1]
                 content = str(data)

             # Store received data with crawled information
             db.add_history(
-                url=data["url"],
+                url=url,
                 title=title,
                 content=content
             )

-            logger.info(f"Processed URL: {data['url']} - {title}")
+            logger.info(f"Processed URL: {url} - {title}")
             await websocket.send_json({
                 "status": "received",
                 "data": {
-                    "url": data["url"],
+                    "url": url,
                     "title": title,
                     "timestamp": data["timestamp"]
                 }
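From the client's side, the exchange is a JSON message carrying url and timestamp, answered with either a "skipped" or "received" status. A minimal test-client sketch using the websockets package, assuming the app is served by uvicorn on localhost:8000; the host, port, example URL, and timestamp value are assumptions for illustration:

import asyncio
import json

import websockets  # pip install websockets


async def send_visit(url: str) -> None:
    # Host and port are assumed; the /ws path comes from the handler above.
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        # The handler expects at least "url" and "timestamp" in the payload.
        await ws.send(json.dumps({"url": url, "timestamp": "2025-12-06T02:19:37Z"}))
        reply = json.loads(await ws.recv())
        # "skipped" for excluded domains, "received" after a successful crawl.
        print(reply["status"], reply["data"]["title"])


asyncio.run(send_visit("https://news.example.com/story?id=1"))

For an excluded domain the returned title is the fixed string "Excluded Domain"; otherwise it is the title extracted by the crawler, falling back to the last URL segment.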