Refactor all the things

2025-04-12 14:41:04 -05:00
parent 90dc188dc8
commit 5629a92acf
5 changed files with 210 additions and 75 deletions


@@ -1,22 +1,17 @@
 # Domains to exclude from content reading
 excluded_domains:
-  # Local sites
-  - localhost
-  - 127.0.0.1
-
-  # Specific Domains / Subdomains
-  - ap.www.namecheap.com  # Ignore this specific subdomain
-  - www.namecheap.com     # Ignore the main domain (will cover /twofa/* path implicitly)
-  - login.linode.com      # Ignore the login subdomain
-
-  # IP ranges (requires wildcard matching in config.py)
-  - 192.168.*.*
-  - 10.*.*.*
-  - 172.16.*.*
-  - 0.0.0.*  # Note: Be careful with overly broad patterns
-
-  # Example wildcard patterns (requires wildcard matching in config.py)
-  # - *.local
-  # - *.githubusercontent.com
-  # - *.google.com  # Example: Ignore all google subdomains
+  - "identity.getpostman.com"
+  - "localhost"
+  - "127.0.0.1"
+  - "ap.www.namecheap.com*"
+  - "*.namecheap.com*"
+  - "us.ovhcloud.com*"
+  - "cloud.linode.com*"
+  - "*.linode.com*"
+  - "linode.com*"
+  - "*.login.*"
+  - "*.auth.*"
+  - "192.168.*.*"
+  - "10.*.*.*"
+  - "172.16.*.*"
+  - "0.0.0.*"
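
The entries are now quoted glob patterns rather than bare hostnames; `DomainExclusions.is_excluded` (reworked later in this commit) matches them with Python's `fnmatch` after some normalization. A minimal sketch of the bare `fnmatch` semantics — the hostnames below are made-up examples, not values taken from the commit, and the extra normalization (e.g. trimming a trailing `*`) is omitted here:

# Illustrative only: bare fnmatch semantics for a few of the new patterns.
from fnmatch import fnmatch

patterns = ["*.namecheap.com*", "linode.com*", "192.168.*.*"]
hosts = ["ap.www.namecheap.com", "linode.com", "192.168.1.10", "example.org"]

for host in hosts:
    matched = any(fnmatch(host, p) for p in patterns)
    print(f"{host:>22} -> {'excluded' if matched else 'allowed'}")

For these inputs the outcome is the same with or without the trailing-wildcard trimming that `is_excluded` applies.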

src/base_crawler.py (new file)

@@ -0,0 +1,64 @@
+from typing import Tuple
+from urllib.parse import urlparse
+
+from database import Database
+from domain_exclusions import DomainExclusions
+from logger import Logger
+from crawl4ai import AsyncWebCrawler
+
+
+class BaseCrawler:
+    def __init__(self, db: Database, domain_exclusions: DomainExclusions, logger: Logger):
+        self.db = db
+        self.domain_exclusions = domain_exclusions
+        self.logger = logger
+        self.crawler = AsyncWebCrawler()
+
+    async def __aenter__(self):
+        await self.crawler.__aenter__()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.crawler.__aexit__(exc_type, exc_val, exc_tb)
+
+    def should_skip_url(self, url: str) -> Tuple[bool, str]:
+        # Skip about: or chrome: URLs
+        if url.startswith("about:") or url.startswith("chrome:"):
+            return True, "Browser internal URL"
+
+        domain = urlparse(url).netloc
+
+        # Check domain exclusions
+        if self.domain_exclusions.is_excluded(domain):
+            return True, "Excluded domain"
+
+        # Check if URL exists
+        if self.db.url_exists(url):
+            return True, "URL already processed"
+
+        return False, ""
+
+    async def crawl_url(self, url: str, default_title: str = None) -> Tuple[bool, dict]:
+        try:
+            result = await self.crawler.arun(url=url)
+            crawl_result = result[0]
+            title = crawl_result.metadata.get('title') or default_title or url.split("/")[-1]
+            content = crawl_result.markdown
+
+            self.db.add_history(
+                url=url,
+                title=title,
+                content=content
+            )
+            return True, {
+                "url": url,
+                "title": title,
+                "status": "received"
+            }
+        except Exception as e:
+            self.logger.error(f"Error processing URL {url}: {str(e)}")
+            return False, {
+                "url": url,
+                "title": default_title or url.split("/")[-1],
+                "status": "error",
+                "error": str(e)
+            }
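
A minimal usage sketch of the new class, mirroring how the WebSocket handler below drives it. The `main()` wrapper and the example URL are illustrative only, and it assumes `src/` is on the import path, as the server's `from base_crawler import BaseCrawler` suggests:

# Usage sketch only; not part of the commit.
import asyncio

from base_crawler import BaseCrawler
from database import Database
from domain_exclusions import DomainExclusions
from logger import Logger

async def main():
    db = Database()
    exclusions = DomainExclusions()
    logger = Logger()

    # __aenter__/__aexit__ start and stop the wrapped AsyncWebCrawler
    async with BaseCrawler(db, exclusions, logger) as crawler:
        url = "https://example.org/some/page"  # illustrative URL
        skip, reason = crawler.should_skip_url(url)
        if skip:
            logger.info(f"Skipping {url}: {reason}")
        else:
            ok, info = await crawler.crawl_url(url)
            logger.info(f"crawl_url finished with status={info['status']}")

asyncio.run(main())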


@@ -46,6 +46,17 @@ class Database:
                 updated TIMESTAMP NOT NULL
             )
         ''')
+
+        # Add index on url column
+        self.cursor.execute('''
+            CREATE INDEX IF NOT EXISTS idx_history_url ON history(url)
+        ''')
+
+        # Add unique index on url column
+        self.cursor.execute('''
+            CREATE UNIQUE INDEX IF NOT EXISTS idx_history_url_unique ON history(url)
+        ''')
+
         self.conn.commit()
 
     def add_history(self, url: str, title: str, content: str) -> int:
@@ -104,6 +115,11 @@ class Database:
         self.conn.commit()
         return self.cursor.rowcount > 0
 
+    def url_exists(self, url: str) -> bool:
+        """Check if a URL already exists in the database."""
+        self.cursor.execute('SELECT 1 FROM history WHERE url = ? LIMIT 1', (url,))
+        return self.cursor.fetchone() is not None
+
     def __del__(self):
         """Cleanup database connection."""
         if hasattr(self, 'conn'):
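
The `?` placeholders and `self.conn`/`self.cursor` usage imply an SQLite backend. A standalone sketch of what the unique index and the `url_exists` lookup do, using an in-memory database and a trimmed-down schema rather than the project's `Database` class:

# Standalone sqlite3 sketch; schema and URLs are illustrative.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE history (id INTEGER PRIMARY KEY, url TEXT NOT NULL)")
conn.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_history_url_unique ON history(url)")

conn.execute("INSERT INTO history (url) VALUES (?)", ("https://example.org/a",))

# Equivalent of Database.url_exists: an indexed point lookup
exists = conn.execute(
    "SELECT 1 FROM history WHERE url = ? LIMIT 1", ("https://example.org/a",)
).fetchone() is not None
print(exists)  # True

# With the unique index in place, inserting the same URL again is rejected
try:
    conn.execute("INSERT INTO history (url) VALUES (?)", ("https://example.org/a",))
except sqlite3.IntegrityError as e:
    print("duplicate rejected:", e)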


@@ -2,7 +2,7 @@ import yaml
 from fnmatch import fnmatch
 
 class DomainExclusions:
-    def __init__(self, config_path="history_config.yaml"):
+    def __init__(self, config_path="config/history_config.yaml"):
         self.excluded_domains = []
         self.load_config(config_path)
@@ -12,8 +12,11 @@ class DomainExclusions:
             with open(config_path, 'r') as f:
                 config = yaml.safe_load(f)
-                # Get the excluded_domains list from config, defaulting to empty list if not found
-                self.excluded_domains = config.get('excluded_domains', [])
+                # Handle both direct list and dict with 'excluded_domains' key
+                if isinstance(config, list):
+                    self.excluded_domains = config
+                else:
+                    self.excluded_domains = config.get('excluded_domains', [])
         except FileNotFoundError:
             print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
         except yaml.YAMLError as e:
@@ -23,12 +26,30 @@ class DomainExclusions:
     def is_excluded(self, domain):
         """
         Check if a domain matches any of the excluded domain patterns.
+        Supports wildcards (*, ?) in the excluded domain patterns.
+        Args:
+            domain (str): The domain to check
+        Returns:
+            bool: True if the domain should be excluded, False otherwise
         """
-        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
+        # Strip protocol (http:// or https://) if present
+        domain = domain.lower().strip('/')
+        if '://' in domain:
+            domain = domain.split('://', 1)[1]
+        # Strip query parameters if present
+        if '?' in domain:
+            domain = domain.split('?', 1)[0]
+        # Split domain and path
+        if '/' in domain:
+            domain = domain.split('/', 1)[0]
+        for pattern in self.excluded_domains:
+            pattern = pattern.lower().strip('/')
+            if '/' in pattern:
+                pattern = pattern.split('/', 1)[0]
+            # Remove trailing wildcard if present
+            if pattern.endswith('*'):
+                pattern = pattern.rstrip('*').rstrip('.')
+            # Use fnmatch for proper wildcard pattern matching
+            if fnmatch(domain, pattern):
+                return True
+        return False
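
A small usage sketch of the reworked matcher. It assumes the exclusion list shown at the top of this commit is what the default `config/history_config.yaml` path loads relative to the working directory; the URLs are illustrative:

# Usage sketch only; not part of the commit.
from domain_exclusions import DomainExclusions

exclusions = DomainExclusions()

print(exclusions.is_excluded("cloud.linode.com"))                      # True: matches "cloud.linode.com*"
print(exclusions.is_excluded("https://cloud.linode.com/profile?x=1"))  # True: scheme/path/query stripped first
print(exclusions.is_excluded("example.org"))                           # False: no pattern matches

Passing a full URL now works because the method strips the scheme, path, and query string before matching, whereas the caller previously had to pass a bare hostname.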


@@ -1,76 +1,115 @@
 from fastapi import FastAPI, WebSocket
+from starlette.websockets import WebSocketDisconnect
 import uvicorn
 from logger import Logger
 import os
 from database import Database
 from crawl4ai import AsyncWebCrawler
 from domain_exclusions import DomainExclusions
-from urllib.parse import urlparse
+from base_crawler import BaseCrawler
+import asyncio
+from contextlib import asynccontextmanager
+from browser_history import get_history
 
 # Create logs directory if it doesn't exist
 os.makedirs('logs', exist_ok=True)
 
-app = FastAPI()
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Startup
+    global history_crawler
+    logger.info("Initializing crawler and loading browser history...")
+    try:
+        # Initialize history crawler
+        history_crawler = HistoryCrawler(db, domain_exclusions, logger)
+        async with history_crawler:  # Use async context manager
+            outputs = get_history()
+            history_crawler.crawl_queue = outputs.histories
+            logger.info(f"Loaded {len(history_crawler.crawl_queue)} URLs from browser history")
+
+            # Start the crawler in the background
+            task = asyncio.create_task(history_crawler.start_crawler())
+
+            yield
+
+            # Stop the crawler
+            history_crawler.is_running = False
+            await task  # Wait for crawler to finish
+    except Exception as e:
+        logger.error(f"Error during startup: {str(e)}")
+        yield
+
+app = FastAPI(lifespan=lifespan)
 logger = Logger()
 db = Database()
 domain_exclusions = DomainExclusions()
 
+class HistoryCrawler(BaseCrawler):
+    def __init__(self, db: Database, domain_exclusions: DomainExclusions, logger: Logger):
+        super().__init__(db, domain_exclusions, logger)
+        self.crawl_queue = []
+        self.is_running = True
+
+    async def start_crawler(self):
+        while self.is_running and self.crawl_queue:
+            timestamp, url, title = self.crawl_queue.pop(0)
+
+            should_skip, skip_reason = self.should_skip_url(url)
+            if should_skip:
+                self.logger.info(f"Skipping URL from history: {url} ({skip_reason})")
+                continue
+
+            success, result = await self.crawl_url(url, title)
+            if success:
+                self.logger.info(f"Processed historical URL: {url}")
+
+            await asyncio.sleep(30)  # Wait 30 seconds before next crawl
+
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()
     logger.info("New WebSocket connection established")
-    async with AsyncWebCrawler() as crawler:
-        try:
-            while True:
-                data = await websocket.receive_json()
-                url = data["url"]
-                domain = urlparse(url).netloc
-
-                # Check if domain is excluded
-                if domain_exclusions.is_excluded(domain):
-                    logger.info(f"Skipping excluded domain: {domain}")
-                    await websocket.send_json({
-                        "status": "skipped",
-                        "data": {
-                            "url": url,
-                            "title": "Excluded Domain",
-                            "timestamp": data["timestamp"]
-                        }
-                    })
-                    continue
-
-                try:
-                    result = await crawler.arun(url=url)
-                    crawl_result = result[0]
-                    title = crawl_result.metadata.get('title') or url.split("/")[-1]
-                    content = crawl_result.markdown
-                except Exception as crawl_error:
-                    logger.error(f"Crawling error for {url}: {str(crawl_error)}")
-                    title = url.split("/")[-1]
-                    content = str(data)
-
-                db.add_history(
-                    url=url,
-                    title=title,
-                    content=content
-                )
-                logger.info(f"Processed URL: {url} - {title}")
-
-                await websocket.send_json({
-                    "status": "received",
-                    "data": {
-                        "url": url,
-                        "title": title,
-                        "timestamp": data["timestamp"]
-                    }
-                })
-        except Exception as e:
-            logger.error(f"WebSocket error: {str(e)}")
-            await websocket.close()
-        finally:
-            logger.info("WebSocket connection closed")
+    ws_crawler = BaseCrawler(db, domain_exclusions, logger)
+
+    try:
+        while True:
+            data = await websocket.receive_json()
+            url = data["url"]
+
+            should_skip, skip_reason = ws_crawler.should_skip_url(url)
+            if should_skip:
+                logger.info(f"Skipping URL: {url} ({skip_reason})")
+                await websocket.send_json({
+                    "status": "skipped",
+                    "data": {
+                        "url": url,
+                        "title": skip_reason,
+                        "timestamp": data["timestamp"]
+                    }
+                })
+                continue
+
+            success, result = await ws_crawler.crawl_url(url)
+            await websocket.send_json({
+                "status": result["status"],
+                "data": {
+                    "url": result["url"],
+                    "title": result["title"],
+                    "timestamp": data["timestamp"]
+                }
+            })
+    except WebSocketDisconnect:
+        logger.info("WebSocket connection closed by client")
+    except Exception as e:
+        logger.error(f"WebSocket error: {str(e)}")
+        try:
+            await websocket.close()
+        except RuntimeError:
+            # Connection might already be closed
+            pass
+    finally:
+        logger.info("WebSocket connection closed")
 
 if __name__ == "__main__":
     logger.info("Starting WebSocket server...")