Move files, add domain exclusion

2025-04-12 12:53:48 -05:00
parent cac52687c8
commit 1cf86dd48b
5 changed files with 59 additions and 8 deletions

src/database.py (Normal file, 110 lines added)

@@ -0,0 +1,110 @@
import sqlite3
from datetime import datetime
from typing import Optional, List, Dict
import threading


class Database:
    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        with cls._lock:
            if cls._instance is None:
                cls._instance = super(Database, cls).__new__(cls)
                cls._instance._initialize_db()
            return cls._instance

    def _initialize_db(self):
        """Initialize the database connection and create tables if they don't exist."""
        self.conn = sqlite3.connect('history.db', check_same_thread=False)
        self.conn.row_factory = sqlite3.Row
        try:
            # Set WAL mode first, before any other operations
            self.conn.execute('PRAGMA journal_mode=WAL')
            # Other performance and reliability optimizations
            self.conn.execute('PRAGMA synchronous=NORMAL')  # Balance between safety and speed
            self.conn.execute('PRAGMA temp_store=MEMORY')   # Store temp tables and indices in memory
            self.conn.execute('PRAGMA cache_size=-64000')   # Use 64MB of memory for page cache
            self.conn.execute('PRAGMA foreign_keys=ON')     # Enable foreign key constraints
        except Exception as e:
            print(f"Error setting database PRAGMA options: {e}")
            # Re-raise so a misconfigured connection is never used silently
            raise
        self.cursor = self.conn.cursor()
        # Create history table
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT NOT NULL,
                title TEXT NOT NULL,
                content TEXT NOT NULL,
                created TIMESTAMP NOT NULL,
                updated TIMESTAMP NOT NULL
            )
        ''')
        self.conn.commit()
    def add_history(self, url: str, title: str, content: str) -> int:
        """Add a new history entry."""
        now = datetime.utcnow()
        with self._lock:
            self.cursor.execute('''
                INSERT INTO history (url, title, content, created, updated)
                VALUES (?, ?, ?, ?, ?)
            ''', (url, title, content, now, now))
            self.conn.commit()
            return self.cursor.lastrowid

    def get_history(self, limit: int = 100) -> List[Dict]:
        """Get history entries, ordered by most recent first."""
        # Take the lock here too: the cursor is shared, so an unguarded
        # read can interleave with a concurrent write on another thread.
        with self._lock:
            self.cursor.execute('''
                SELECT * FROM history
                ORDER BY created DESC
                LIMIT ?
            ''', (limit,))
            return [dict(row) for row in self.cursor.fetchall()]
    def update_history(self, id: int, title: Optional[str] = None,
                       content: Optional[str] = None) -> bool:
        """Update an existing history entry."""
        update_fields = []
        values = []
        if title is not None:
            update_fields.append("title = ?")
            values.append(title)
        if content is not None:
            update_fields.append("content = ?")
            values.append(content)
        if not update_fields:
            return False
        update_fields.append("updated = ?")
        values.append(datetime.utcnow())
        values.append(id)
        with self._lock:
            self.cursor.execute(f'''
                UPDATE history
                SET {", ".join(update_fields)}
                WHERE id = ?
            ''', values)
            self.conn.commit()
            return self.cursor.rowcount > 0

    def delete_history(self, id: int) -> bool:
        """Delete a history entry."""
        with self._lock:
            self.cursor.execute('DELETE FROM history WHERE id = ?', (id,))
            self.conn.commit()
            return self.cursor.rowcount > 0

    def __del__(self):
        """Cleanup database connection."""
        if hasattr(self, 'conn'):
            self.conn.close()
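
Not part of the commit: a minimal usage sketch of the Database singleton above. Every call to Database() returns the same cached instance, so the module can be imported anywhere without opening extra connections; the URL and titles here are placeholders.

from database import Database

db = Database()
row_id = db.add_history("https://example.com", "Example", "page text")
db.update_history(row_id, title="Example (updated)")
for entry in db.get_history(limit=10):
    print(entry["id"], entry["title"], entry["created"])
db.delete_history(row_id)
assert Database() is db  # __new__ hands back the cached instance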

src/domain_exclusions.py (Normal file, 34 lines added)

@@ -0,0 +1,34 @@
import yaml
from fnmatch import fnmatch


class DomainExclusions:
    def __init__(self, config_path="history_config.yaml"):
        self.excluded_domains = []
        self.load_config(config_path)

    def load_config(self, config_path):
        """Load excluded domains from the YAML configuration file."""
        try:
            with open(config_path, 'r') as f:
                # An empty file loads as None; treat it as an empty config
                config = yaml.safe_load(f) or {}
            # Get the excluded_domains list from config, defaulting to an
            # empty list if the key is missing
            self.excluded_domains = config.get('excluded_domains', [])
        except FileNotFoundError:
            print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
        except yaml.YAMLError as e:
            print(f"Error parsing YAML configuration: {e}")
            self.excluded_domains = []

    def is_excluded(self, domain):
        """
        Check if a domain matches any of the excluded domain patterns.
        Supports wildcards (*, ?) in the excluded domain patterns.

        Args:
            domain (str): The domain to check

        Returns:
            bool: True if the domain should be excluded, False otherwise
        """
        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
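
Not part of the commit: a sketch of the expected history_config.yaml shape and the matching behavior. The patterns and filenames below are illustrative only; the commit only reads the excluded_domains key.

import yaml
from domain_exclusions import DomainExclusions

# Hypothetical config content written for the demo
with open("history_config.yaml", "w") as f:
    yaml.safe_dump({"excluded_domains": ["localhost", "*.internal.example", "bank?.example.com"]}, f)

excl = DomainExclusions("history_config.yaml")
assert excl.is_excluded("LOCALHOST")              # matching is case-insensitive
assert excl.is_excluded("wiki.internal.example")  # '*' spans any prefix
assert excl.is_excluded("bank1.example.com")      # '?' matches one character
assert not excl.is_excluded("example.com")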

src/logger.py (Normal file, 36 lines added)

@@ -0,0 +1,36 @@
import logging
import os
from datetime import datetime
from typing import Optional


class Logger:
    _instance: Optional['Logger'] = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialize()
        return cls._instance

    def _initialize(self):
        # Ensure the log directory exists even when Logger is constructed
        # before (or without) main.py's own makedirs call
        os.makedirs('logs', exist_ok=True)
        # Configure logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(f'logs/main_{datetime.now().strftime("%Y%m%d")}.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def info(self, message: str):
        self.logger.info(message)

    def error(self, message: str):
        self.logger.error(message)

    def warning(self, message: str):
        self.logger.warning(message)

    def debug(self, message: str):
        self.logger.debug(message)
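
Not part of the commit: a short usage sketch. Because __new__ caches the first instance, logging.basicConfig runs exactly once no matter how many modules construct Logger.

from logger import Logger

log = Logger()
log.info("server starting")  # goes to logs/main_YYYYMMDD.log and stderr
log.debug("not emitted")     # below the configured INFO level
assert Logger() is log       # later constructions reuse the instance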

src/main.py (Normal file, 81 lines added)

@@ -0,0 +1,81 @@
from fastapi import FastAPI, WebSocket
import uvicorn
from logger import Logger
import os
from database import Database
from crawl4ai import AsyncWebCrawler
from domain_exclusions import DomainExclusions
from urllib.parse import urlparse

# Create logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

app = FastAPI()
logger = Logger()
db = Database()
domain_exclusions = DomainExclusions()  # Initialize with default config path


@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    logger.info("New WebSocket connection established")
    # Create crawler instance outside the loop for reuse
    async with AsyncWebCrawler() as crawler:
        try:
            while True:
                data = await websocket.receive_json()
                url = data["url"]
                domain = urlparse(url).netloc
                # Check if domain is excluded
                if domain_exclusions.is_excluded(domain):
                    logger.info(f"Skipping excluded domain: {domain}")
                    await websocket.send_json({
                        "status": "skipped",
                        "data": {
                            "url": url,
                            "title": "Excluded Domain",
                            "timestamp": data["timestamp"]
                        }
                    })
                    continue
                # Crawl the URL to get title and content
                try:
                    result = await crawler.arun(url=url)
                    # Get the first result from the container and access metadata
                    crawl_result = result[0]
                    title = crawl_result.metadata.get('title') or url.split("/")[-1]
                    content = crawl_result.markdown
                except Exception as crawl_error:
                    logger.error(f"Crawling error for {url}: {str(crawl_error)}")
                    title = url.split("/")[-1]
                    content = str(data)
                # Store received data with crawled information
                db.add_history(
                    url=url,
                    title=title,
                    content=content
                )
                logger.info(f"Processed URL: {url} - {title}")
                await websocket.send_json({
                    "status": "received",
                    "data": {
                        "url": url,
                        "title": title,
                        "timestamp": data["timestamp"]
                    }
                })
        except Exception as e:
            logger.error(f"WebSocket error: {str(e)}")
            await websocket.close()
        finally:
            logger.info("WebSocket connection closed")


if __name__ == "__main__":
    logger.info("Starting WebSocket server...")
    uvicorn.run(app, host="0.0.0.0", port=8523)
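
Not part of the commit: a client-side sketch assuming the third-party websockets package (not a dependency of this code). It sends the JSON shape the endpoint reads ({"url": ..., "timestamp": ...}) and prints the reply, whose status field will be either "received" or "skipped".

import asyncio
import json
import websockets  # assumed client library, install separately

async def send_url(url: str) -> None:
    async with websockets.connect("ws://localhost:8523/ws") as ws:
        await ws.send(json.dumps({"url": url, "timestamp": "2025-04-12T12:53:48-05:00"}))
        reply = json.loads(await ws.recv())
        print(reply["status"], reply["data"]["title"])

asyncio.run(send_url("https://example.com"))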