Mirror of https://github.com/Zetaphor/browser-recall.git (synced 2025-12-06 10:29:38 +00:00)

Commit: Move files, add domain exclusion
src/database.py (new file, 110 lines)
@@ -0,0 +1,110 @@
import sqlite3
from datetime import datetime
from typing import Optional, List, Dict
import threading


class Database:
    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        with cls._lock:
            if cls._instance is None:
                cls._instance = super(Database, cls).__new__(cls)
                cls._instance._initialize_db()
            return cls._instance

    def _initialize_db(self):
        """Initialize the database connection and create tables if they don't exist."""
        self.conn = sqlite3.connect('history.db', check_same_thread=False)
        self.conn.row_factory = sqlite3.Row

        try:
            # Set WAL mode first, before any other operations
            self.conn.execute('PRAGMA journal_mode=WAL')

            # Other performance and reliability optimizations
            self.conn.execute('PRAGMA synchronous=NORMAL')  # Balance between safety and speed
            self.conn.execute('PRAGMA temp_store=MEMORY')   # Store temp tables and indices in memory
            self.conn.execute('PRAGMA cache_size=-64000')   # Use 64MB of memory for page cache
            self.conn.execute('PRAGMA foreign_keys=ON')     # Enable foreign key constraints
        except Exception as e:
            print(f"Error setting database PRAGMA options: {e}")
            # Re-raise so startup halts on a misconfigured database
            raise

        self.cursor = self.conn.cursor()

        # Create history table
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT NOT NULL,
                title TEXT NOT NULL,
                content TEXT NOT NULL,
                created TIMESTAMP NOT NULL,
                updated TIMESTAMP NOT NULL
            )
        ''')
        self.conn.commit()

    def add_history(self, url: str, title: str, content: str) -> int:
        """Add a new history entry."""
        now = datetime.utcnow()
        with self._lock:
            self.cursor.execute('''
                INSERT INTO history (url, title, content, created, updated)
                VALUES (?, ?, ?, ?, ?)
            ''', (url, title, content, now, now))
            self.conn.commit()
            return self.cursor.lastrowid

    def get_history(self, limit: int = 100) -> List[Dict]:
        """Get history entries, ordered by most recent first."""
        # Lock added: the shared cursor is not safe to use from multiple threads
        with self._lock:
            self.cursor.execute('''
                SELECT * FROM history
                ORDER BY created DESC
                LIMIT ?
            ''', (limit,))
            return [dict(row) for row in self.cursor.fetchall()]

    def update_history(self, id: int, title: Optional[str] = None,
                       content: Optional[str] = None) -> bool:
        """Update an existing history entry."""
        update_fields = []
        values = []

        if title is not None:
            update_fields.append("title = ?")
            values.append(title)
        if content is not None:
            update_fields.append("content = ?")
            values.append(content)

        if not update_fields:
            return False

        update_fields.append("updated = ?")
        values.append(datetime.utcnow())
        values.append(id)

        with self._lock:
            self.cursor.execute(f'''
                UPDATE history
                SET {", ".join(update_fields)}
                WHERE id = ?
            ''', values)
            self.conn.commit()
            return self.cursor.rowcount > 0

    def delete_history(self, id: int) -> bool:
        """Delete a history entry."""
        with self._lock:
            self.cursor.execute('DELETE FROM history WHERE id = ?', (id,))
            self.conn.commit()
            return self.cursor.rowcount > 0

    def __del__(self):
        """Cleanup database connection."""
        if hasattr(self, 'conn'):
            self.conn.close()
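A minimal usage sketch of the Database singleton (hypothetical; this example is not part of the commit). Because __new__ always hands back the one shared instance, any module can call Database() and get the same connection:

# usage_sketch.py -- hypothetical example, not in the repository
from database import Database

db = Database()
entry_id = db.add_history(
    url="https://example.com/article",
    title="Example Article",
    content="Page text as markdown...",
)
print(db.get_history(limit=5))        # most recent entries first
db.update_history(entry_id, title="Renamed Article")
db.delete_history(entry_id)
assert Database() is db               # singleton: same instance everywhere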
src/domain_exclusions.py (new file, 34 lines)
@@ -0,0 +1,34 @@
import yaml
from fnmatch import fnmatch


class DomainExclusions:
    def __init__(self, config_path="history_config.yaml"):
        self.excluded_domains = []
        self.load_config(config_path)

    def load_config(self, config_path):
        """Load excluded domains from the YAML configuration file."""
        try:
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f) or {}  # safe_load returns None for an empty file

            # Get the excluded_domains list from the config, defaulting to an empty list
            self.excluded_domains = config.get('excluded_domains', [])
        except FileNotFoundError:
            print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
        except yaml.YAMLError as e:
            print(f"Error parsing YAML configuration: {e}")
            self.excluded_domains = []

    def is_excluded(self, domain):
        """
        Check if a domain matches any of the excluded domain patterns.
        Supports wildcards (*, ?) in the excluded domain patterns.

        Args:
            domain (str): The domain to check

        Returns:
            bool: True if the domain should be excluded, False otherwise
        """
        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
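The key read by load_config is excluded_domains, and matching is case-insensitive fnmatch, so each pattern accepts shell-style wildcards. A sketch of the expected history_config.yaml shape and the resulting behavior (the patterns here are hypothetical):

# exclusion_sketch.py -- hypothetical example, not in the repository
from domain_exclusions import DomainExclusions

# Write a sample config; history_config.yaml is expected to look like:
#   excluded_domains:
#     - "*.google.com"
#     - "localhost"
with open("history_config.yaml", "w") as f:
    f.write("excluded_domains:\n  - '*.google.com'\n  - 'localhost'\n")

excl = DomainExclusions("history_config.yaml")
print(excl.is_excluded("mail.google.com"))  # True: matches *.google.com
print(excl.is_excluded("google.com"))       # False: '*.google.com' needs a dot before the suffix
print(excl.is_excluded("LOCALHOST"))        # True: comparison is case-insensitive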
src/logger.py (new file, 36 lines)
@@ -0,0 +1,36 @@
import logging
from datetime import datetime
from typing import Optional


class Logger:
    _instance: Optional['Logger'] = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialize()
        return cls._instance

    def _initialize(self):
        # Configure logging; the logs/ directory must already exist
        # (main.py creates it before the first Logger() call)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(f'logs/main_{datetime.now().strftime("%Y%m%d")}.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def info(self, message: str):
        self.logger.info(message)

    def error(self, message: str):
        self.logger.error(message)

    def warning(self, message: str):
        self.logger.warning(message)

    def debug(self, message: str):
        self.logger.debug(message)
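Logger follows the same singleton pattern, so every module that instantiates it shares one logging configuration. A minimal sketch (hypothetical; not part of the commit):

# logger_sketch.py -- hypothetical example, not in the repository
import os
from logger import Logger

os.makedirs('logs', exist_ok=True)  # the FileHandler path requires this directory
log = Logger()
log.info("Written to both the console and logs/main_YYYYMMDD.log")
assert Logger() is log              # every call returns the same instance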
src/main.py (new file, 81 lines)
@@ -0,0 +1,81 @@
from fastapi import FastAPI, WebSocket
import uvicorn
from logger import Logger
import os
from database import Database
from crawl4ai import AsyncWebCrawler
from domain_exclusions import DomainExclusions
from urllib.parse import urlparse

# Create logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

app = FastAPI()
logger = Logger()

db = Database()
domain_exclusions = DomainExclusions()  # Initialize with default config path

@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    logger.info("New WebSocket connection established")

    # Create crawler instance outside the loop for reuse
    async with AsyncWebCrawler() as crawler:
        try:
            while True:
                data = await websocket.receive_json()
                url = data["url"]
                domain = urlparse(url).netloc

                # Check if domain is excluded
                if domain_exclusions.is_excluded(domain):
                    logger.info(f"Skipping excluded domain: {domain}")
                    await websocket.send_json({
                        "status": "skipped",
                        "data": {
                            "url": url,
                            "title": "Excluded Domain",
                            "timestamp": data["timestamp"]
                        }
                    })
                    continue

                # Crawl the URL to get title and content
                try:
                    result = await crawler.arun(url=url)
                    # Get the first result from the container and access metadata
                    crawl_result = result[0]
                    title = crawl_result.metadata.get('title') or url.split("/")[-1]
                    content = crawl_result.markdown
                except Exception as crawl_error:
                    logger.error(f"Crawling error for {url}: {str(crawl_error)}")
                    title = url.split("/")[-1]
                    content = str(data)

                # Store received data with crawled information
                db.add_history(
                    url=url,
                    title=title,
                    content=content
                )

                logger.info(f"Processed URL: {url} - {title}")
                await websocket.send_json({
                    "status": "received",
                    "data": {
                        "url": url,
                        "title": title,
                        "timestamp": data["timestamp"]
                    }
                })
        except Exception as e:
            logger.error(f"WebSocket error: {str(e)}")
            await websocket.close()
        finally:
            logger.info("WebSocket connection closed")

if __name__ == "__main__":
    logger.info("Starting WebSocket server...")
    uvicorn.run(app, host="0.0.0.0", port=8523)
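The endpoint expects each incoming JSON message to carry "url" and "timestamp" fields and answers with a "status" of "received" or "skipped" plus a "data" object. A minimal client sketch of that protocol, assuming the third-party websockets package (the real client, a browser extension, is not part of this commit):

# client_sketch.py -- hypothetical example, not in the repository
# pip install websockets
import asyncio
import json
import time

import websockets

async def send_visit(url: str) -> None:
    async with websockets.connect("ws://localhost:8523/ws") as ws:
        # Shape mirrors what websocket_endpoint reads: data["url"], data["timestamp"]
        await ws.send(json.dumps({"url": url, "timestamp": time.time()}))
        reply = json.loads(await ws.recv())
        print(reply["status"], "-", reply["data"]["title"])

asyncio.run(send_visit("https://example.com"))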