Mirror of https://github.com/Zetaphor/browser-recall.git (synced 2025-12-06 02:19:37 +00:00)
Refactor all the things
@@ -1,22 +1,17 @@
 # Domains to exclude from content reading
 excluded_domains:
-  # Local sites
-  - localhost
-  - 127.0.0.1
-
-  # Specific Domains / Subdomains
-  - ap.www.namecheap.com # Ignore this specific subdomain
-  - www.namecheap.com # Ignore the main domain (will cover /twofa/* path implicitly)
-  - login.linode.com # Ignore the login subdomain
-
-  # IP ranges (requires wildcard matching in config.py)
-  - 192.168.*.*
-  - 10.*.*.*
-  - 172.16.*.*
-  - 0.0.0.* # Note: Be careful with overly broad patterns
-
-  # Example wildcard patterns (requires wildcard matching in config.py)
-  # - *.local
-  # - *.githubusercontent.com
-  # - *.google.com # Example: Ignore all google subdomains
-
+  - "identity.getpostman.com"
+  - "localhost"
+  - "127.0.0.1"
+  - "ap.www.namecheap.com*"
+  - "*.namecheap.com*"
+  - "us.ovhcloud.com*"
+  - "cloud.linode.com*"
+  - "*.linode.com*"
+  - "linode.com*"
+  - "*.login.*"
+  - "*.auth.*"
+  - "192.168.*.*"
+  - "10.*.*.*"
+  - "172.16.*.*"
+  - "0.0.0.*"
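The rewritten exclusion list quotes every entry and expresses each one as a glob-style pattern instead of the earlier commented categories. Below is a minimal sketch of how such patterns behave under Python's fnmatch, which is what DomainExclusions uses later in this commit; the sample domains are illustrative and not taken from the config itself.

from fnmatch import fnmatch

# Patterns copied from the new config above; sample domains are made up for illustration.
patterns = ["*.linode.com*", "192.168.*.*"]
samples = ["cloud.linode.com", "api.linode.com", "192.168.1.10", "example.org"]

for domain in samples:
    excluded = any(fnmatch(domain, pattern) for pattern in patterns)
    print(f"{domain}: {'excluded' if excluded else 'kept'}")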
src/base_crawler.py (new file, 64 lines)
@@ -0,0 +1,64 @@
+from typing import Tuple
+from urllib.parse import urlparse
+from database import Database
+from domain_exclusions import DomainExclusions
+from logger import Logger
+from crawl4ai import AsyncWebCrawler
+
+class BaseCrawler:
+    def __init__(self, db: Database, domain_exclusions: DomainExclusions, logger: Logger):
+        self.db = db
+        self.domain_exclusions = domain_exclusions
+        self.logger = logger
+        self.crawler = AsyncWebCrawler()
+
+    async def __aenter__(self):
+        await self.crawler.__aenter__()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.crawler.__aexit__(exc_type, exc_val, exc_tb)
+
+    def should_skip_url(self, url: str) -> Tuple[bool, str]:
+        # Skip about: or chrome: URLs
+        if url.startswith("about:") or url.startswith("chrome:"):
+            return True, "Browser internal URL"
+
+        domain = urlparse(url).netloc
+
+        # Check domain exclusions
+        if self.domain_exclusions.is_excluded(domain):
+            return True, "Excluded domain"
+
+        # Check if URL exists
+        if self.db.url_exists(url):
+            return True, "URL already processed"
+
+        return False, ""
+
+    async def crawl_url(self, url: str, default_title: str = None) -> Tuple[bool, dict]:
+        try:
+            result = await self.crawler.arun(url=url)
+            crawl_result = result[0]
+            title = crawl_result.metadata.get('title') or default_title or url.split("/")[-1]
+            content = crawl_result.markdown
+
+            self.db.add_history(
+                url=url,
+                title=title,
+                content=content
+            )
+
+            return True, {
+                "url": url,
+                "title": title,
+                "status": "received"
+            }
+        except Exception as e:
+            self.logger.error(f"Error processing URL {url}: {str(e)}")
+            return False, {
+                "url": url,
+                "title": default_title or url.split("/")[-1],
+                "status": "error",
+                "error": str(e)
+            }
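Since base_crawler.py is new in this commit, here is a hedged sketch of how the class appears intended to be used, based only on the methods above: an async context manager around crawl4ai's AsyncWebCrawler, with should_skip_url gating crawl_url. The wiring below (constructing Database, DomainExclusions and Logger with no arguments, and the example URL) is an assumption for illustration, not code from the repository.

import asyncio
from database import Database
from domain_exclusions import DomainExclusions
from logger import Logger
from base_crawler import BaseCrawler

async def demo():
    # Assumed wiring; mirrors the constructor signature shown in the diff.
    async with BaseCrawler(Database(), DomainExclusions(), Logger()) as crawler:
        url = "https://example.com/some-page"  # hypothetical URL
        should_skip, reason = crawler.should_skip_url(url)
        if should_skip:
            print(f"skipped: {reason}")
        else:
            ok, result = await crawler.crawl_url(url)
            print(ok, result["status"])

asyncio.run(demo())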
src/database.py
@@ -46,6 +46,17 @@ class Database:
                 updated TIMESTAMP NOT NULL
             )
         ''')
+
+        # Add index on url column
+        self.cursor.execute('''
+            CREATE INDEX IF NOT EXISTS idx_history_url ON history(url)
+        ''')
+
+        # Add unique index on url column
+        self.cursor.execute('''
+            CREATE UNIQUE INDEX IF NOT EXISTS idx_history_url ON history(url)
+        ''')
+
         self.conn.commit()
 
     def add_history(self, url: str, title: str, content: str) -> int:
@@ -104,6 +115,11 @@ class Database:
         self.conn.commit()
         return self.cursor.rowcount > 0
 
+    def url_exists(self, url: str) -> bool:
+        """Check if a URL already exists in the database."""
+        self.cursor.execute('SELECT 1 FROM history WHERE url = ? LIMIT 1', (url,))
+        return self.cursor.fetchone() is not None
+
     def __del__(self):
         """Cleanup database connection."""
        if hasattr(self, 'conn'):
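The new url_exists check pairs with the unique index so that BaseCrawler.should_skip_url can treat already-stored URLs as duplicates; note that both CREATE statements reuse the name idx_history_url, so where the plain index already exists the UNIQUE variant is skipped by IF NOT EXISTS (assuming the sqlite3 backend that the ? placeholders suggest). The full Database class is not shown in this hunk, so the snippet below is a standalone sqlite3 sketch of the same idea rather than the project's actual code.

import sqlite3

# Standalone sketch with an assumed minimal schema; the real history table has more columns.
conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE history (url TEXT, title TEXT, content TEXT)")
cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_history_url ON history(url)")

def url_exists(url: str) -> bool:
    # Same query shape as the new Database.url_exists method.
    cur.execute("SELECT 1 FROM history WHERE url = ? LIMIT 1", (url,))
    return cur.fetchone() is not None

cur.execute("INSERT INTO history (url, title, content) VALUES (?, ?, ?)",
            ("https://example.com", "Example", "..."))
print(url_exists("https://example.com"))  # True
print(url_exists("https://example.org"))  # False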
src/domain_exclusions.py
@@ -2,7 +2,7 @@ import yaml
 from fnmatch import fnmatch
 
 class DomainExclusions:
-    def __init__(self, config_path="history_config.yaml"):
+    def __init__(self, config_path="config/history_config.yaml"):
         self.excluded_domains = []
         self.load_config(config_path)
 
@@ -12,8 +12,11 @@ class DomainExclusions:
             with open(config_path, 'r') as f:
                 config = yaml.safe_load(f)
 
-            # Get the excluded_domains list from config, defaulting to empty list if not found
-            self.excluded_domains = config.get('excluded_domains', [])
+            # Handle both direct list and dict with 'excluded_domains' key
+            if isinstance(config, list):
+                self.excluded_domains = config
+            else:
+                self.excluded_domains = config.get('excluded_domains', [])
         except FileNotFoundError:
             print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
         except yaml.YAMLError as e:
@@ -23,12 +26,30 @@ class DomainExclusions:
     def is_excluded(self, domain):
         """
         Check if a domain matches any of the excluded domain patterns.
-        Supports wildcards (*, ?) in the excluded domain patterns.
-
-        Args:
-            domain (str): The domain to check
-
-        Returns:
-            bool: True if the domain should be excluded, False otherwise
         """
-        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
+        # Strip protocol (http:// or https://) if present
+        domain = domain.lower().strip('/')
+        if '://' in domain:
+            domain = domain.split('://', 1)[1]
+
+        # Strip query parameters if present
+        if '?' in domain:
+            domain = domain.split('?', 1)[0]
+
+        # Split domain and path
+        if '/' in domain:
+            domain = domain.split('/', 1)[0]
+
+        for pattern in self.excluded_domains:
+            pattern = pattern.lower().strip('/')
+            if '/' in pattern:
+                pattern = pattern.split('/', 1)[0]
+
+            # Remove trailing wildcard if present
+            if pattern.endswith('*'):
+                pattern = pattern.rstrip('*').rstrip('.')
+
+            # Use fnmatch for proper wildcard pattern matching
+            if fnmatch(domain, pattern):
+                return True
+        return False
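The rewritten is_excluded now normalizes its input (stripping scheme, path, and query) before applying fnmatch, so it tolerates being handed a full URL rather than a bare netloc. A small illustration follows; the patterns are assigned directly on the instance to avoid depending on the YAML file, and are illustrative only.

from domain_exclusions import DomainExclusions

exclusions = DomainExclusions()  # prints a warning and stays empty if the config file is absent
exclusions.excluded_domains = ["*.linode.com*", "192.168.*.*"]  # illustrative patterns

print(exclusions.is_excluded("cloud.linode.com"))                  # True
print(exclusions.is_excluded("https://cloud.linode.com/billing"))  # True: scheme and path are stripped
print(exclusions.is_excluded("192.168.0.42"))                      # True
print(exclusions.is_excluded("example.org"))                       # False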
src/main.py
@@ -1,76 +1,115 @@
 from fastapi import FastAPI, WebSocket
+from starlette.websockets import WebSocketDisconnect
 import uvicorn
 from logger import Logger
 import os
 from database import Database
 from crawl4ai import AsyncWebCrawler
 from domain_exclusions import DomainExclusions
-from urllib.parse import urlparse
+from base_crawler import BaseCrawler
+import asyncio
+from contextlib import asynccontextmanager
+from browser_history import get_history
 
 # Create logs directory if it doesn't exist
 os.makedirs('logs', exist_ok=True)
 
-app = FastAPI()
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Startup
+    global history_crawler
+    logger.info("Initializing crawler and loading browser history...")
+    try:
+        # Initialize history crawler
+        history_crawler = HistoryCrawler(db, domain_exclusions, logger)
+        async with history_crawler:  # Use async context manager
+            outputs = get_history()
+            history_crawler.crawl_queue = outputs.histories
+            logger.info(f"Loaded {len(history_crawler.crawl_queue)} URLs from browser history")
+
+            # Start the crawler in the background
+            task = asyncio.create_task(history_crawler.start_crawler())
+            yield
+            # Stop the crawler
+            history_crawler.is_running = False
+            await task  # Wait for crawler to finish
+
+    except Exception as e:
+        logger.error(f"Error during startup: {str(e)}")
+        yield
+
+app = FastAPI(lifespan=lifespan)
 logger = Logger()
 
 db = Database()
 domain_exclusions = DomainExclusions()
 
+class HistoryCrawler(BaseCrawler):
+    def __init__(self, db: Database, domain_exclusions: DomainExclusions, logger: Logger):
+        super().__init__(db, domain_exclusions, logger)
+        self.crawl_queue = []
+        self.is_running = True
+
+    async def start_crawler(self):
+        while self.is_running and self.crawl_queue:
+            timestamp, url, title = self.crawl_queue.pop(0)
+
+            should_skip, skip_reason = self.should_skip_url(url)
+            if should_skip:
+                self.logger.info(f"Skipping URL from history: {url} ({skip_reason})")
+                continue
+
+            success, result = await self.crawl_url(url, title)
+            if success:
+                self.logger.info(f"Processed historical URL: {url}")
+
+            await asyncio.sleep(30)  # Wait 30 seconds before next crawl
+
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()
     logger.info("New WebSocket connection established")
 
-    async with AsyncWebCrawler() as crawler:
-        try:
-            while True:
-                data = await websocket.receive_json()
-                url = data["url"]
-                domain = urlparse(url).netloc
-
-                # Check if domain is excluded
-                if domain_exclusions.is_excluded(domain):
-                    logger.info(f"Skipping excluded domain: {domain}")
-                    await websocket.send_json({
-                        "status": "skipped",
-                        "data": {
-                            "url": url,
-                            "title": "Excluded Domain",
-                            "timestamp": data["timestamp"]
-                        }
-                    })
-                    continue
-
-                try:
-                    result = await crawler.arun(url=url)
-                    crawl_result = result[0]
-                    title = crawl_result.metadata.get('title') or url.split("/")[-1]
-                    content = crawl_result.markdown
-                except Exception as crawl_error:
-                    logger.error(f"Crawling error for {url}: {str(crawl_error)}")
-                    title = url.split("/")[-1]
-                    content = str(data)
-
-                db.add_history(
-                    url=url,
-                    title=title,
-                    content=content
-                )
-
-                logger.info(f"Processed URL: {url} - {title}")
-                await websocket.send_json({
-                    "status": "received",
-                    "data": {
-                        "url": url,
-                        "title": title,
-                        "timestamp": data["timestamp"]
-                    }
-                })
-        except Exception as e:
-            logger.error(f"WebSocket error: {str(e)}")
-            await websocket.close()
-        finally:
-            logger.info("WebSocket connection closed")
+    ws_crawler = BaseCrawler(db, domain_exclusions, logger)
+
+    try:
+        while True:
+            data = await websocket.receive_json()
+            url = data["url"]
+
+            should_skip, skip_reason = ws_crawler.should_skip_url(url)
+            if should_skip:
+                logger.info(f"Skipping URL: {url} ({skip_reason})")
+                await websocket.send_json({
+                    "status": "skipped",
+                    "data": {
+                        "url": url,
+                        "title": skip_reason,
+                        "timestamp": data["timestamp"]
+                    }
+                })
+                continue
+
+            success, result = await ws_crawler.crawl_url(url)
+            await websocket.send_json({
+                "status": result["status"],
+                "data": {
+                    "url": result["url"],
+                    "title": result["title"],
+                    "timestamp": data["timestamp"]
+                }
+            })
+
+    except WebSocketDisconnect:
+        logger.info("WebSocket connection closed by client")
+    except Exception as e:
+        logger.error(f"WebSocket error: {str(e)}")
+        try:
+            await websocket.close()
+        except RuntimeError:
+            # Connection might already be closed
+            pass
+    finally:
+        logger.info("WebSocket connection closed")
 
 if __name__ == "__main__":
     logger.info("Starting WebSocket server...")
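For context, the reworked /ws handler still expects JSON messages with url and timestamp fields and replies with a status plus data envelope. A hypothetical test client is sketched below; the host and port are assumptions (the uvicorn.run line falls outside the hunk shown above) and the websockets package is not part of this repository.

import asyncio
import json
import websockets

async def send_page():
    # Assumed server address; adjust to whatever uvicorn.run actually binds.
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        await ws.send(json.dumps({"url": "https://example.com", "timestamp": "2025-01-01T00:00:00Z"}))
        reply = json.loads(await ws.recv())
        print(reply["status"], reply["data"]["title"])

asyncio.run(send_page())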