Move files, add domain exclusion

2025-04-12 12:53:48 -05:00
parent cac52687c8
commit 1cf86dd48b
5 changed files with 59 additions and 8 deletions

src/database.py (Normal file, 110 lines added)

@@ -0,0 +1,110 @@
import sqlite3
from datetime import datetime
from typing import Optional, List, Dict
import threading


class Database:
    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        with cls._lock:
            if cls._instance is None:
                cls._instance = super(Database, cls).__new__(cls)
                cls._instance._initialize_db()
            return cls._instance

    def _initialize_db(self):
        """Initialize the database connection and create tables if they don't exist."""
        self.conn = sqlite3.connect('history.db', check_same_thread=False)
        self.conn.row_factory = sqlite3.Row
        try:
            # Set WAL mode first, before any other operations
            self.conn.execute('PRAGMA journal_mode=WAL')
            # Other performance and reliability optimizations
            self.conn.execute('PRAGMA synchronous=NORMAL')  # Balance between safety and speed
            self.conn.execute('PRAGMA temp_store=MEMORY')   # Store temp tables and indices in memory
            self.conn.execute('PRAGMA cache_size=-64000')   # Use 64MB of memory for page cache
            self.conn.execute('PRAGMA foreign_keys=ON')     # Enable foreign key constraints
        except Exception as e:
            print(f"Error setting database PRAGMA options: {e}")
            # Re-raise so a misconfigured connection is never used silently
            raise
        self.cursor = self.conn.cursor()
        # Create history table
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT NOT NULL,
                title TEXT NOT NULL,
                content TEXT NOT NULL,
                created TIMESTAMP NOT NULL,
                updated TIMESTAMP NOT NULL
            )
        ''')
        self.conn.commit()
    def add_history(self, url: str, title: str, content: str) -> int:
        """Add a new history entry."""
        now = datetime.utcnow()
        with self._lock:
            self.cursor.execute('''
                INSERT INTO history (url, title, content, created, updated)
                VALUES (?, ?, ?, ?, ?)
            ''', (url, title, content, now, now))
            self.conn.commit()
            return self.cursor.lastrowid

    def get_history(self, limit: int = 100) -> List[Dict]:
        """Get history entries, ordered by most recent first."""
        # Take the lock here too: the cursor is shared, so an unguarded
        # read can interleave with a concurrent write on another thread.
        with self._lock:
            self.cursor.execute('''
                SELECT * FROM history
                ORDER BY created DESC
                LIMIT ?
            ''', (limit,))
            return [dict(row) for row in self.cursor.fetchall()]
    def update_history(self, id: int, title: Optional[str] = None,
                       content: Optional[str] = None) -> bool:
        """Update an existing history entry."""
        update_fields = []
        values = []
        if title is not None:
            update_fields.append("title = ?")
            values.append(title)
        if content is not None:
            update_fields.append("content = ?")
            values.append(content)
        if not update_fields:
            return False
        update_fields.append("updated = ?")
        values.append(datetime.utcnow())
        values.append(id)
        with self._lock:
            self.cursor.execute(f'''
                UPDATE history
                SET {", ".join(update_fields)}
                WHERE id = ?
            ''', values)
            self.conn.commit()
            return self.cursor.rowcount > 0

    def delete_history(self, id: int) -> bool:
        """Delete a history entry."""
        with self._lock:
            self.cursor.execute('DELETE FROM history WHERE id = ?', (id,))
            self.conn.commit()
            return self.cursor.rowcount > 0

    def __del__(self):
        """Cleanup database connection."""
        if hasattr(self, 'conn'):
            self.conn.close()
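
Not part of the commit: a minimal usage sketch of the Database singleton above. Every call to Database() returns the same cached instance, so the module can be imported anywhere without opening extra connections; the URL and titles here are placeholders.

from database import Database

db = Database()
row_id = db.add_history("https://example.com", "Example", "page text")
db.update_history(row_id, title="Example (updated)")
for entry in db.get_history(limit=10):
    print(entry["id"], entry["title"], entry["created"])
db.delete_history(row_id)
assert Database() is db  # __new__ hands back the cached instance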

src/domain_exclusions.py (Normal file, 34 lines added)

@@ -0,0 +1,34 @@
import yaml
from fnmatch import fnmatch


class DomainExclusions:
    def __init__(self, config_path="history_config.yaml"):
        self.excluded_domains = []
        self.load_config(config_path)

    def load_config(self, config_path):
        """Load excluded domains from the YAML configuration file."""
        try:
            with open(config_path, 'r') as f:
                # An empty file loads as None; treat it as an empty config
                config = yaml.safe_load(f) or {}
            # Get the excluded_domains list from config, defaulting to an
            # empty list if the key is missing
            self.excluded_domains = config.get('excluded_domains', [])
        except FileNotFoundError:
            print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
        except yaml.YAMLError as e:
            print(f"Error parsing YAML configuration: {e}")
            self.excluded_domains = []

    def is_excluded(self, domain):
        """
        Check if a domain matches any of the excluded domain patterns.
        Supports wildcards (*, ?) in the excluded domain patterns.

        Args:
            domain (str): The domain to check

        Returns:
            bool: True if the domain should be excluded, False otherwise
        """
        return any(fnmatch(domain.lower(), pattern.lower()) for pattern in self.excluded_domains)
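
Not part of the commit: a sketch of the expected history_config.yaml shape and the matching behavior. The patterns and filenames below are illustrative only; the commit only reads the excluded_domains key.

import yaml
from domain_exclusions import DomainExclusions

# Hypothetical config content written for the demo
with open("history_config.yaml", "w") as f:
    yaml.safe_dump({"excluded_domains": ["localhost", "*.internal.example", "bank?.example.com"]}, f)

excl = DomainExclusions("history_config.yaml")
assert excl.is_excluded("LOCALHOST")              # matching is case-insensitive
assert excl.is_excluded("wiki.internal.example")  # '*' spans any prefix
assert excl.is_excluded("bank1.example.com")      # '?' matches one character
assert not excl.is_excluded("example.com")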

src/logger.py (Normal file, 36 lines added)

@@ -0,0 +1,36 @@
import logging
import os
from datetime import datetime
from typing import Optional


class Logger:
    _instance: Optional['Logger'] = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialize()
        return cls._instance

    def _initialize(self):
        # Ensure the log directory exists even when Logger is constructed
        # before (or without) main.py's own makedirs call
        os.makedirs('logs', exist_ok=True)
        # Configure logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(f'logs/main_{datetime.now().strftime("%Y%m%d")}.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def info(self, message: str):
        self.logger.info(message)

    def error(self, message: str):
        self.logger.error(message)

    def warning(self, message: str):
        self.logger.warning(message)

    def debug(self, message: str):
        self.logger.debug(message)
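
Not part of the commit: a short usage sketch. Because __new__ caches the first instance, logging.basicConfig runs exactly once no matter how many modules construct Logger.

from logger import Logger

log = Logger()
log.info("server starting")  # goes to logs/main_YYYYMMDD.log and stderr
log.debug("not emitted")     # below the configured INFO level
assert Logger() is log       # later constructions reuse the instance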

src/main.py (Normal file, 81 lines added)

@@ -0,0 +1,81 @@
from fastapi import FastAPI, WebSocket
import uvicorn
from logger import Logger
import os
from database import Database
from crawl4ai import AsyncWebCrawler
from domain_exclusions import DomainExclusions
from urllib.parse import urlparse

# Create logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

app = FastAPI()
logger = Logger()
db = Database()
domain_exclusions = DomainExclusions()  # Initialize with default config path


@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    logger.info("New WebSocket connection established")
    # Create crawler instance outside the loop for reuse
    async with AsyncWebCrawler() as crawler:
        try:
            while True:
                data = await websocket.receive_json()
                url = data["url"]
                domain = urlparse(url).netloc
                # Check if domain is excluded
                if domain_exclusions.is_excluded(domain):
                    logger.info(f"Skipping excluded domain: {domain}")
                    await websocket.send_json({
                        "status": "skipped",
                        "data": {
                            "url": url,
                            "title": "Excluded Domain",
                            "timestamp": data["timestamp"]
                        }
                    })
                    continue
                # Crawl the URL to get title and content
                try:
                    result = await crawler.arun(url=url)
                    # Get the first result from the container and access metadata
                    crawl_result = result[0]
                    title = crawl_result.metadata.get('title') or url.split("/")[-1]
                    content = crawl_result.markdown
                except Exception as crawl_error:
                    logger.error(f"Crawling error for {url}: {str(crawl_error)}")
                    title = url.split("/")[-1]
                    content = str(data)
                # Store received data with crawled information
                db.add_history(
                    url=url,
                    title=title,
                    content=content
                )
                logger.info(f"Processed URL: {url} - {title}")
                await websocket.send_json({
                    "status": "received",
                    "data": {
                        "url": url,
                        "title": title,
                        "timestamp": data["timestamp"]
                    }
                })
        except Exception as e:
            logger.error(f"WebSocket error: {str(e)}")
            await websocket.close()
        finally:
            logger.info("WebSocket connection closed")


if __name__ == "__main__":
    logger.info("Starting WebSocket server...")
    uvicorn.run(app, host="0.0.0.0", port=8523)
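
Not part of the commit: a client-side sketch assuming the third-party websockets package (not a dependency of this code). It sends the JSON shape the endpoint reads ({"url": ..., "timestamp": ...}) and prints the reply, whose status field will be either "received" or "skipped".

import asyncio
import json
import websockets  # assumed client library, install separately

async def send_url(url: str) -> None:
    async with websockets.connect("ws://localhost:8523/ws") as ws:
        await ws.send(json.dumps({"url": url, "timestamp": "2025-04-12T12:53:48-05:00"}))
        reply = json.loads(await ws.recv())
        print(reply["status"], reply["data"]["title"])

asyncio.run(send_url("https://example.com"))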