diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..d4c3740
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,6 @@
+# Meilisearch Configuration
+MEILISEARCH_HOST=http://localhost:7700
+# Generate a master key using: openssl rand -hex 32
+MEILISEARCH_MASTER_KEY=your_master_key_here
+
+# Example of a generated key (do not reuse this value): 6d99b335033595ea62d02a5641b94e04e80c33c1e1f1f789c84445ff5
\ No newline at end of file
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000..e188c78
--- /dev/null
+++ b/app/__init__.py
@@ -0,0 +1 @@
+# This file can be empty; it just marks the directory as a Python package.
\ No newline at end of file
diff --git a/app/config.py b/app/config.py
new file mode 100644
index 0000000..31ba229
--- /dev/null
+++ b/app/config.py
@@ -0,0 +1,92 @@
+import yaml
+from pathlib import Path
+from typing import Set
+import fnmatch
+
+class ReaderConfig:
+ def __init__(self):
+ self.excluded_patterns: Set[str] = set()
+ self._load_config()
+
+ def _load_config(self):
+ config_path = Path("config/reader_config.yaml")
+ if not config_path.exists():
+ print("Warning: reader_config.yaml not found, creating default config")
+ self._create_default_config(config_path)
+
+ try:
+ with open(config_path, 'r') as f:
+ config = yaml.safe_load(f)
+ self.excluded_patterns = set(config.get('excluded_domains', []))
+ except Exception as e:
+ print(f"Error loading config: {e}")
+ self.excluded_patterns = set()
+
+ def _create_default_config(self, config_path: Path):
+ config_path.parent.mkdir(parents=True, exist_ok=True)
+ default_config = {
+ 'excluded_domains': [
+ 'localhost',
+ '127.0.0.1',
+ '192.168.*.*',
+ '10.*.*.*'
+ ]
+ }
+ with open(config_path, 'w') as f:
+ yaml.safe_dump(default_config, f, default_flow_style=False)
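+        # For reference, the generated config/reader_config.yaml will look roughly like:
+        #   excluded_domains:
+        #   - localhost
+        #   - 127.0.0.1
+        #   - 192.168.*.*
+        #   - 10.*.*.*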
+
+ def is_domain_excluded(self, domain: str) -> bool:
+ """
+ Check if a domain matches any exclusion pattern.
+ Supports glob-style wildcards (* and ?)
+ Examples:
+ - '*.example.com' matches any subdomain of example.com
+ - 'reddit-*.com' matches reddit-video.com, reddit-static.com, etc.
+ - '192.168.*.*' matches any IP in the 192.168.0.0/16 subnet
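+
+        With the default excluded_domains config created above, for example:
+            is_domain_excluded('localhost')       -> True
+            is_domain_excluded('192.168.1.10')    -> True
+            is_domain_excluded('docs.python.org') -> False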
+ """
+ domain = domain.lower()
+
+ # Check each pattern
+ for pattern in self.excluded_patterns:
+ pattern = pattern.lower()
+
+ # Handle IP address patterns specially
+ if any(c.isdigit() for c in pattern):
+ if self._match_ip_pattern(domain, pattern):
+ return True
+
+ # Handle domain patterns
+ if fnmatch.fnmatch(domain, pattern):
+ return True
+ # Also check if the pattern matches when prepended with a dot
+ # This handles cases like 'example.com' matching 'subdomain.example.com'
+ if fnmatch.fnmatch(domain, f"*.{pattern}"):
+ return True
+
+ return False
+
+ def _match_ip_pattern(self, domain: str, pattern: str) -> bool:
+ """
+ Special handling for IP address patterns.
+ Handles cases like '192.168.*.*' matching '192.168.1.1'
+ """
+ # Skip if domain isn't IP-like
+ if not any(c.isdigit() for c in domain):
+ return False
+
+ # Split into octets
+ domain_parts = domain.split('.')
+ pattern_parts = pattern.split('.')
+
+ # Must have same number of parts
+ if len(domain_parts) != len(pattern_parts):
+ return False
+
+ # Check each octet
+ for domain_part, pattern_part in zip(domain_parts, pattern_parts):
+ if pattern_part == '*':
+ continue
+ if domain_part != pattern_part:
+ return False
+
+ return True
\ No newline at end of file
diff --git a/app/database.py b/app/database.py
index 391fe80..c8edecf 100644
--- a/app/database.py
+++ b/app/database.py
@@ -1,6 +1,7 @@
-from sqlalchemy import create_engine, Column, Integer, String, DateTime
+from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
+from datetime import datetime
SQLALCHEMY_DATABASE_URL = "sqlite:///./browser_history.db"
@@ -10,13 +11,15 @@ SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()
class HistoryEntry(Base):
- __tablename__ = "history_entries"
+ __tablename__ = "history"
- id = Column(Integer, primary_key=True, index=True)
- url = Column(String, index=True)
- title = Column(String, nullable=True)
- visit_time = Column(DateTime, index=True)
- domain = Column(String, index=True)
+ id = Column(Integer, primary_key=True)
+ url = Column(String)
+ title = Column(String)
+ visit_time = Column(DateTime)
+ domain = Column(String)
+ markdown_content = Column(Text, nullable=True)
+ last_content_update = Column(DateTime, nullable=True)
class Bookmark(Base):
__tablename__ = "bookmarks"
@@ -28,6 +31,37 @@ class Bookmark(Base):
folder = Column(String, index=True)
domain = Column(String, index=True)
+class BlacklistedDomain(Base):
+ __tablename__ = "blacklisted_domains"
+
+ id = Column(Integer, primary_key=True)
+ domain = Column(String, unique=True, index=True)
+ reason = Column(String, nullable=True)
+ added_time = Column(DateTime, default=datetime.utcnow)
+
+ @classmethod
+ def is_blacklisted(cls, db: SessionLocal, domain: str) -> bool:
+ """Check if a domain is blacklisted"""
+ return db.query(cls).filter(cls.domain == domain.lower()).first() is not None
+
+ @classmethod
+ def add_to_blacklist(cls, db: SessionLocal, domain: str, reason: str = None):
+ """Add a domain to the blacklist"""
+ try:
+ blacklist_entry = cls(
+ domain=domain.lower(),
+ reason=reason
+ )
+ db.add(blacklist_entry)
+ db.commit()
+        except Exception:
+            db.rollback()
+            # Most likely the domain is already blacklisted (unique constraint), so just update the reason
+ existing = db.query(cls).filter(cls.domain == domain.lower()).first()
+ if existing and reason:
+ existing.reason = reason
+ db.commit()
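+
+    # Typical usage (sketch), given a Session from get_db():
+    #   BlacklistedDomain.add_to_blacklist(db, "tracker.example.com", reason="ad tracker")
+    #   BlacklistedDomain.is_blacklisted(db, "tracker.example.com")  # -> True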
+
Base.metadata.create_all(bind=engine)
def get_db():
diff --git a/app/main.py b/app/main.py
index 13f6feb..dcf2866 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,15 +1,31 @@
-from fastapi import FastAPI, Depends, Query
+from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect
from sqlalchemy.orm import Session
-from datetime import datetime
-from typing import List
+from datetime import datetime, timezone
+from typing import List, Optional
import asyncio
+from urllib.parse import urlparse
+from fastapi.middleware.cors import CORSMiddleware
+import iso8601
from .database import get_db, HistoryEntry, Bookmark
from .scheduler import HistoryScheduler
+from .page_info import PageInfo
+from .page_reader import PageReader
app = FastAPI()
scheduler = HistoryScheduler()
+# Add CORS middleware to allow WebSocket connections
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"], # In production, specify your domains
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
+
@app.on_event("startup")
async def startup_event():
# Initial bookmark fetch
@@ -17,12 +33,37 @@ async def startup_event():
# Start the background task
asyncio.create_task(scheduler.update_history())
+def serialize_history_entry(entry, include_content: bool = False):
+ """Serialize a HistoryEntry object to a dictionary"""
+ result = {
+ "id": entry.id,
+ "url": entry.url,
+ "title": entry.title,
+ "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
+ "domain": entry.domain,
+ }
+ if include_content:
+ result["markdown_content"] = entry.markdown_content
+ return result
+
+def serialize_bookmark(bookmark):
+ """Serialize a Bookmark object to a dictionary"""
+ return {
+ "id": bookmark.id,
+ "url": bookmark.url,
+ "title": bookmark.title,
+ "added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
+ "folder": bookmark.folder,
+ "domain": bookmark.domain,
+ }
+
@app.get("/history/search")
async def search_history(
- domain: str = Query(None),
- start_date: datetime = Query(None),
- end_date: datetime = Query(None),
- search_term: str = Query(None),
+ domain: Optional[str] = Query(None),
+ start_date: Optional[datetime] = Query(None),
+ end_date: Optional[datetime] = Query(None),
+ search_term: Optional[str] = Query(None),
+ include_content: bool = Query(False),
db: Session = Depends(get_db)
):
query = db.query(HistoryEntry)
@@ -37,15 +78,19 @@ async def search_history(
query = query.filter(HistoryEntry.visit_time <= end_date)
if search_term:
- query = query.filter(HistoryEntry.title.ilike(f"%{search_term}%"))
+ query = query.filter(
+ (HistoryEntry.title.ilike(f"%{search_term}%")) |
+ (HistoryEntry.markdown_content.ilike(f"%{search_term}%"))
+ )
- return query.all()
+ entries = query.all()
+ return [serialize_history_entry(entry, include_content) for entry in entries]
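+# Example: GET /history/search?domain=github.com&search_term=fastapi&include_content=true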
@app.get("/bookmarks/search")
async def search_bookmarks(
- domain: str = Query(None),
- folder: str = Query(None),
- search_term: str = Query(None),
+ domain: Optional[str] = Query(None),
+ folder: Optional[str] = Query(None),
+ search_term: Optional[str] = Query(None),
db: Session = Depends(get_db)
):
query = db.query(Bookmark)
@@ -59,4 +104,73 @@ async def search_bookmarks(
if search_term:
query = query.filter(Bookmark.title.ilike(f"%{search_term}%"))
- return query.all()
\ No newline at end of file
+ bookmarks = query.all()
+ return [serialize_bookmark(bookmark) for bookmark in bookmarks]
+
+@app.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
+ print("WebSocket endpoint called")
+ page_reader = PageReader()
+    print("Accepting WebSocket connection...")
+ await websocket.accept()
+ print("WebSocket connection accepted")
+ try:
+ while True:
+ print("Waiting for message...")
+ data = await websocket.receive_json()
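+            # Expected message shape (as consumed below):
+            #   {"url": "<page url>", "html": "<raw html>", "timestamp": "<ISO-8601 string>"}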
+ print(f"Received message for URL: {data['url']}")
+ print(f"HTML content length: {len(data['html'])}")
+ print(f"Timestamp: {data['timestamp']}")
+
+            # Parse the ISO-8601 timestamp
+ timestamp = iso8601.parse_date(data['timestamp'])
+
+ page_info = PageInfo(
+ url=data['url'],
+ html=data['html'],
+ timestamp=timestamp
+ )
+ print(f"Created PageInfo object for: {page_info.url}")
+
+ # Convert HTML to markdown
+ print("Converting HTML to markdown...")
+ markdown_content = page_reader.html_to_markdown(page_info.html)
+ print(f"Markdown conversion complete, length: {len(markdown_content) if markdown_content else 0}")
+
+            # Create a new history entry for this page (each message inserts a new row)
+ domain = urlparse(page_info.url).netloc
+ print(f"Creating history entry for domain: {domain}")
+ history_entry = HistoryEntry(
+ url=page_info.url,
+ visit_time=page_info.timestamp,
+ domain=domain,
+ markdown_content=markdown_content,
+ last_content_update=datetime.now(timezone.utc)
+ )
+
+ print("Saving to database...")
+ db.add(history_entry)
+ db.commit()
+ print("Database save complete")
+
+ # Send confirmation back to client
+ await websocket.send_json({
+ "status": "success",
+ "message": f"Processed page: {page_info.url}"
+ })
+
+ except WebSocketDisconnect:
+ print("Client disconnected")
+ except Exception as e:
+ print(f"Error handling message: {e}")
+ # Send error back to client if possible
+ try:
+ await websocket.send_json({
+ "status": "error",
+ "message": str(e)
+ })
+        except Exception:
+ pass
+ finally:
+ print("Cleaning up resources")
+ page_reader.close()
\ No newline at end of file
diff --git a/app/page_info.py b/app/page_info.py
index 404104f..5e5ae4b 100644
--- a/app/page_info.py
+++ b/app/page_info.py
@@ -1,16 +1,8 @@
-import asyncio
-import aiohttp
-from bs4 import BeautifulSoup
-from typing import Optional
+from dataclasses import dataclass
+from datetime import datetime
-class PageInfoFetcher:
- async def get_page_title(self, url: str) -> Optional[str]:
- try:
- async with aiohttp.ClientSession() as session:
- async with session.get(url, timeout=5) as response:
- if response.status == 200:
- html = await response.text()
- soup = BeautifulSoup(html, 'html.parser')
- return soup.title.string if soup.title else None
- except:
- return None
\ No newline at end of file
+@dataclass
+class PageInfo:
+ url: str
+ html: str
+ timestamp: datetime
\ No newline at end of file
diff --git a/app/page_reader.py b/app/page_reader.py
new file mode 100644
index 0000000..e2d8175
--- /dev/null
+++ b/app/page_reader.py
@@ -0,0 +1,101 @@
+import re
+from markdownify import markdownify as md
+from bs4 import BeautifulSoup
+from typing import Optional
+from urllib.parse import urlparse
+from .config import ReaderConfig
+import logging
+from .database import SessionLocal, BlacklistedDomain
+
+# Setup logging with less verbose output
+logging.basicConfig(
+ level=logging.WARNING,
+ format='%(levelname)s: %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Patterns for cleaning
+SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
+STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
+META_PATTERN = r"<[ ]*meta.*?>"
+COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
+LINK_PATTERN = r"<[ ]*link.*?>"
+BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
+SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"