mirror of https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 10:29:38 +00:00
All the things
.env.example (new file, 6 lines)
@@ -0,0 +1,6 @@
# Meilisearch Configuration
MEILISEARCH_HOST=http://localhost:7700
# Generate a master key using: openssl rand -hex 32
MEILISEARCH_MASTER_KEY=your_master_key_here

# Example master key: 6d99b335033595ea62d02a5641b94e04e80c33c1e1f1f789c84445ff5
app/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
# This file can be empty, it just marks the directory as a Python package
app/config.py (new file, 92 lines)
@@ -0,0 +1,92 @@
import yaml
from pathlib import Path
from typing import Set
import fnmatch


class ReaderConfig:
    def __init__(self):
        self.excluded_patterns: Set[str] = set()
        self._load_config()

    def _load_config(self):
        config_path = Path("config/reader_config.yaml")
        if not config_path.exists():
            print("Warning: reader_config.yaml not found, creating default config")
            self._create_default_config(config_path)

        try:
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
                self.excluded_patterns = set(config.get('excluded_domains', []))
        except Exception as e:
            print(f"Error loading config: {e}")
            self.excluded_patterns = set()

    def _create_default_config(self, config_path: Path):
        config_path.parent.mkdir(parents=True, exist_ok=True)
        default_config = {
            'excluded_domains': [
                'localhost',
                '127.0.0.1',
                '192.168.*.*',
                '10.*.*.*'
            ]
        }
        with open(config_path, 'w') as f:
            yaml.safe_dump(default_config, f, default_flow_style=False)

    def is_domain_excluded(self, domain: str) -> bool:
        """
        Check if a domain matches any exclusion pattern.
        Supports glob-style wildcards (* and ?)
        Examples:
        - '*.example.com' matches any subdomain of example.com
        - 'reddit-*.com' matches reddit-video.com, reddit-static.com, etc.
        - '192.168.*.*' matches any IP in the 192.168.0.0/16 subnet
        """
        domain = domain.lower()

        # Check each pattern
        for pattern in self.excluded_patterns:
            pattern = pattern.lower()

            # Handle IP address patterns specially
            if any(c.isdigit() for c in pattern):
                if self._match_ip_pattern(domain, pattern):
                    return True

            # Handle domain patterns
            if fnmatch.fnmatch(domain, pattern):
                return True
            # Also check if the pattern matches when prepended with a dot
            # This handles cases like 'example.com' matching 'subdomain.example.com'
            if fnmatch.fnmatch(domain, f"*.{pattern}"):
                return True

        return False

    def _match_ip_pattern(self, domain: str, pattern: str) -> bool:
        """
        Special handling for IP address patterns.
        Handles cases like '192.168.*.*' matching '192.168.1.1'
        """
        # Skip if domain isn't IP-like
        if not any(c.isdigit() for c in domain):
            return False

        # Split into octets
        domain_parts = domain.split('.')
        pattern_parts = pattern.split('.')

        # Must have same number of parts
        if len(domain_parts) != len(pattern_parts):
            return False

        # Check each octet
        for domain_part, pattern_part in zip(domain_parts, pattern_parts):
            if pattern_part == '*':
                continue
            if domain_part != pattern_part:
                return False

        return True
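For illustration (not part of the commit), a minimal sketch of how ReaderConfig.is_domain_excluded behaves against the default patterns above; it assumes the script runs from the project root so the relative config/reader_config.yaml path resolves:

from app.config import ReaderConfig

config = ReaderConfig()
print(config.is_domain_excluded("localhost"))        # True: exact match
print(config.is_domain_excluded("192.168.1.50"))     # True: octet-wise match on '192.168.*.*'
print(config.is_domain_excluded("docs.python.org"))  # False: no default pattern matches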
app/database.py (modified)
@@ -1,6 +1,7 @@
-from sqlalchemy import create_engine, Column, Integer, String, DateTime
+from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
+from datetime import datetime

 SQLALCHEMY_DATABASE_URL = "sqlite:///./browser_history.db"

@@ -10,13 +11,15 @@ SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 Base = declarative_base()

 class HistoryEntry(Base):
-    __tablename__ = "history_entries"
+    __tablename__ = "history"

-    id = Column(Integer, primary_key=True, index=True)
-    url = Column(String, index=True)
-    title = Column(String, nullable=True)
-    visit_time = Column(DateTime, index=True)
-    domain = Column(String, index=True)
+    id = Column(Integer, primary_key=True)
+    url = Column(String)
+    title = Column(String)
+    visit_time = Column(DateTime)
+    domain = Column(String)
+    markdown_content = Column(Text, nullable=True)
+    last_content_update = Column(DateTime, nullable=True)

 class Bookmark(Base):
     __tablename__ = "bookmarks"
@@ -28,6 +31,37 @@ class Bookmark(Base):
     folder = Column(String, index=True)
     domain = Column(String, index=True)

+class BlacklistedDomain(Base):
+    __tablename__ = "blacklisted_domains"
+
+    id = Column(Integer, primary_key=True)
+    domain = Column(String, unique=True, index=True)
+    reason = Column(String, nullable=True)
+    added_time = Column(DateTime, default=datetime.utcnow)
+
+    @classmethod
+    def is_blacklisted(cls, db: SessionLocal, domain: str) -> bool:
+        """Check if a domain is blacklisted"""
+        return db.query(cls).filter(cls.domain == domain.lower()).first() is not None
+
+    @classmethod
+    def add_to_blacklist(cls, db: SessionLocal, domain: str, reason: str = None):
+        """Add a domain to the blacklist"""
+        try:
+            blacklist_entry = cls(
+                domain=domain.lower(),
+                reason=reason
+            )
+            db.add(blacklist_entry)
+            db.commit()
+        except:
+            db.rollback()
+            # If entry already exists, just update the reason
+            existing = db.query(cls).filter(cls.domain == domain.lower()).first()
+            if existing and reason:
+                existing.reason = reason
+                db.commit()
+
 Base.metadata.create_all(bind=engine)

 def get_db():
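For illustration (not part of the commit), a short sketch of the new BlacklistedDomain helpers; the domain and reason are made-up values, and the session handling mirrors the get_db pattern above:

from app.database import SessionLocal, BlacklistedDomain

db = SessionLocal()
try:
    BlacklistedDomain.add_to_blacklist(db, "Tracker.Example.COM", reason="ad network")
    # Lookups are effectively case-insensitive because both helpers lower-case the domain
    print(BlacklistedDomain.is_blacklisted(db, "tracker.example.com"))  # True
finally:
    db.close()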
app/main.py (modified)
@@ -1,15 +1,31 @@
-from fastapi import FastAPI, Depends, Query
+from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect
 from sqlalchemy.orm import Session
-from datetime import datetime
+from datetime import datetime, timezone
-from typing import List
+from typing import List, Optional
 import asyncio
+from fastapi import WebSocketDisconnect
+from urllib.parse import urlparse
+import pytz
+from fastapi.middleware.cors import CORSMiddleware
+import iso8601

 from .database import get_db, HistoryEntry, Bookmark
 from .scheduler import HistoryScheduler
+from .page_info import PageInfo
+from .page_reader import PageReader

 app = FastAPI()
 scheduler = HistoryScheduler()
+
+# Add CORS middleware to allow WebSocket connections
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, specify your domains
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)

 @app.on_event("startup")
 async def startup_event():
     # Initial bookmark fetch
@@ -17,12 +33,37 @@ async def startup_event():
     # Start the background task
     asyncio.create_task(scheduler.update_history())

+def serialize_history_entry(entry, include_content: bool = False):
+    """Serialize a HistoryEntry object to a dictionary"""
+    result = {
+        "id": entry.id,
+        "url": entry.url,
+        "title": entry.title,
+        "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
+        "domain": entry.domain,
+    }
+    if include_content:
+        result["markdown_content"] = entry.markdown_content
+    return result
+
+def serialize_bookmark(bookmark):
+    """Serialize a Bookmark object to a dictionary"""
+    return {
+        "id": bookmark.id,
+        "url": bookmark.url,
+        "title": bookmark.title,
+        "added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
+        "folder": bookmark.folder,
+        "domain": bookmark.domain,
+    }
+
 @app.get("/history/search")
 async def search_history(
-    domain: str = Query(None),
-    start_date: datetime = Query(None),
-    end_date: datetime = Query(None),
-    search_term: str = Query(None),
+    domain: Optional[str] = Query(None),
+    start_date: Optional[datetime] = Query(None),
+    end_date: Optional[datetime] = Query(None),
+    search_term: Optional[str] = Query(None),
+    include_content: bool = Query(False),
     db: Session = Depends(get_db)
 ):
     query = db.query(HistoryEntry)
@@ -37,15 +78,19 @@ async def search_history(
         query = query.filter(HistoryEntry.visit_time <= end_date)

     if search_term:
-        query = query.filter(HistoryEntry.title.ilike(f"%{search_term}%"))
+        query = query.filter(
+            (HistoryEntry.title.ilike(f"%{search_term}%")) |
+            (HistoryEntry.markdown_content.ilike(f"%{search_term}%"))
+        )

-    return query.all()
+    entries = query.all()
+    return [serialize_history_entry(entry, include_content) for entry in entries]

 @app.get("/bookmarks/search")
 async def search_bookmarks(
-    domain: str = Query(None),
-    folder: str = Query(None),
-    search_term: str = Query(None),
+    domain: Optional[str] = Query(None),
+    folder: Optional[str] = Query(None),
+    search_term: Optional[str] = Query(None),
     db: Session = Depends(get_db)
 ):
     query = db.query(Bookmark)
@@ -59,4 +104,73 @@ async def search_bookmarks(
     if search_term:
         query = query.filter(Bookmark.title.ilike(f"%{search_term}%"))

-    return query.all()
+    bookmarks = query.all()
+    return [serialize_bookmark(bookmark) for bookmark in bookmarks]
+
+@app.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
+    print("WebSocket endpoint called")
+    page_reader = PageReader()
+    print("New WebSocket connection established")
+    await websocket.accept()
+    print("WebSocket connection accepted")
+    try:
+        while True:
+            print("Waiting for message...")
+            data = await websocket.receive_json()
+            print(f"Received message for URL: {data['url']}")
+            print(f"HTML content length: {len(data['html'])}")
+            print(f"Timestamp: {data['timestamp']}")
+
+            # Parse the ISO timestamp correctly
+            timestamp = iso8601.parse_date(data['timestamp'])
+
+            page_info = PageInfo(
+                url=data['url'],
+                html=data['html'],
+                timestamp=timestamp
+            )
+            print(f"Created PageInfo object for: {page_info.url}")
+
+            # Convert HTML to markdown
+            print("Converting HTML to markdown...")
+            markdown_content = page_reader.html_to_markdown(page_info.html)
+            print(f"Markdown conversion complete, length: {len(markdown_content) if markdown_content else 0}")
+
+            # Update or create history entry
+            domain = urlparse(page_info.url).netloc
+            print(f"Creating history entry for domain: {domain}")
+            history_entry = HistoryEntry(
+                url=page_info.url,
+                visit_time=page_info.timestamp,
+                domain=domain,
+                markdown_content=markdown_content,
+                last_content_update=datetime.now(timezone.utc)
+            )
+
+            print("Saving to database...")
+            db.add(history_entry)
+            db.commit()
+            print("Database save complete")
+
+            # Send confirmation back to client
+            await websocket.send_json({
+                "status": "success",
+                "message": f"Processed page: {page_info.url}"
+            })
+
+    except WebSocketDisconnect:
+        print("Client disconnected")
+    except Exception as e:
+        print(f"Error handling message: {e}")
+        # Send error back to client if possible
+        try:
+            await websocket.send_json({
+                "status": "error",
+                "message": str(e)
+            })
+        except:
+            pass
+    finally:
+        print("Cleaning up resources")
+        page_reader.close()
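For illustration (not part of the commit), one way to exercise the reworked /history/search endpoint once the API is running; the host and port are assumptions (uvicorn's default is localhost:8000) and the search term is arbitrary:

import json
from urllib.parse import urlencode
from urllib.request import urlopen

# include_content=true adds the stored markdown_content to each serialized entry
params = urlencode({"search_term": "python", "include_content": "true"})
with urlopen(f"http://localhost:8000/history/search?{params}") as resp:
    for entry in json.loads(resp.read()):
        print(entry["visit_time"], entry["title"], entry["url"])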
app/page_info.py (modified)
@@ -1,16 +1,8 @@
-import asyncio
-import aiohttp
-from bs4 import BeautifulSoup
-from typing import Optional
+from dataclasses import dataclass
+from datetime import datetime

-class PageInfoFetcher:
-    async def get_page_title(self, url: str) -> Optional[str]:
-        try:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(url, timeout=5) as response:
-                    if response.status == 200:
-                        html = await response.text()
-                        soup = BeautifulSoup(html, 'html.parser')
-                        return soup.title.string if soup.title else None
-        except:
-            return None
+@dataclass
+class PageInfo:
+    url: str
+    html: str
+    timestamp: datetime
app/page_reader.py (new file, 101 lines)
@@ -0,0 +1,101 @@
import re
from markdownify import markdownify as md
from bs4 import BeautifulSoup
from typing import Optional
from urllib.parse import urlparse
from .config import ReaderConfig
import logging
from .database import SessionLocal, BlacklistedDomain

# Setup logging with less verbose output
logging.basicConfig(
    level=logging.WARNING,
    format='%(levelname)s: %(message)s'
)
logger = logging.getLogger(__name__)

# Patterns for cleaning
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
META_PATTERN = r"<[ ]*meta.*?>"
COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
LINK_PATTERN = r"<[ ]*link.*?>"
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"

class PageReader:
    def __init__(self):
        self.config = ReaderConfig()
        self.db = SessionLocal()

    def clean_html(self, html: str) -> str:
        """Clean HTML by removing unwanted elements and patterns."""
        if not html:
            return ""

        # First use regex to remove problematic patterns
        html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(BASE64_IMG_PATTERN, "", html)

        try:
            # Use BeautifulSoup to remove additional elements we want to strip
            soup = BeautifulSoup(html, 'html.parser')

            # Remove unwanted elements
            elements_to_remove = [
                'canvas', 'img', 'picture', 'audio', 'video',
                'iframe', 'embed', 'object', 'param', 'track',
                'map', 'area', 'source'
            ]

            for element in elements_to_remove:
                for tag in soup.find_all(element):
                    tag.decompose()

            return str(soup)
        except Exception as e:
            logger.error(f"Error cleaning HTML: {e}")
            return ""

    def clean_whitespace(self, text: str) -> str:
        """Clean excessive whitespace from text."""
        if not text:
            return ""

        try:
            # Replace 3 or more newlines with 2 newlines
            cleaned = re.sub(r'\n{3,}', '\n\n', text)
            # Remove trailing whitespace from each line
            cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines())
            return cleaned.strip()
        except Exception as e:
            logger.error(f"Error cleaning whitespace: {e}")
            return ""

    def html_to_markdown(self, html: str) -> Optional[str]:
        """Convert HTML to markdown."""
        try:
            cleaned_html = self.clean_html(html)
            if not cleaned_html:
                return None

            return self.clean_whitespace(md(cleaned_html,
                                            heading_style="ATX",
                                            bullets="-",
                                            autolinks=True,
                                            strip=['form'],
                                            escape_asterisks=True,
                                            escape_underscores=True))

        except Exception as e:
            logger.error(f"Error converting to markdown: {e}")
            return None

    def close(self):
        """Cleanup resources"""
        self.db.close()
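For illustration (not part of the commit), a minimal sketch of the PageReader pipeline on an inline snippet; the HTML is made up, and close() is called because the constructor opens a database session:

from app.page_reader import PageReader

reader = PageReader()
try:
    html = (
        "<html><head><script>alert('stripped')</script></head>"
        "<body><h1>Title</h1><p>Some <b>content</b> worth keeping.</p>"
        "<img src='x.png'></body></html>"
    )
    # The <script> is removed by the regex pass, the <img> by the BeautifulSoup pass,
    # and the remainder becomes ATX-style markdown ("# Title" plus the paragraph text).
    print(reader.html_to_markdown(html))
finally:
    reader.close()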
app/scheduler.py (modified)
@@ -3,14 +3,29 @@ from datetime import datetime, timedelta
 import asyncio
 from .database import SessionLocal, HistoryEntry, Bookmark
 from .browser import BrowserHistoryCollector
-from .page_info import PageInfoFetcher
+from .page_reader import PageReader
 from sqlalchemy import func
+from sqlalchemy.orm import Session
+import pytz

 class HistoryScheduler:
     def __init__(self):
         self.browser_collector = BrowserHistoryCollector()
-        self.page_fetcher = PageInfoFetcher()
+        self.page_reader = PageReader()
         self.last_history_update = None
+        self.content_update_interval = timedelta(hours=24)  # Update content daily
+
+    def _normalize_datetime(self, dt: datetime) -> datetime:
+        """Convert datetime to UTC if it has timezone, or make it timezone-aware if it doesn't"""
+        if dt is None:
+            return None
+
+        # If datetime is naive (no timezone), assume it's in UTC
+        if dt.tzinfo is None:
+            return pytz.UTC.localize(dt)
+
+        # If datetime has timezone, convert to UTC
+        return dt.astimezone(pytz.UTC)

     async def update_bookmarks(self):
         bookmarks = self.browser_collector.fetch_bookmarks()
@@ -26,6 +41,9 @@ class HistoryScheduler:

         new_entries = []
         for added_time, url, title, folder in bookmarks:
+            # Normalize the datetime
+            added_time = self._normalize_datetime(added_time)
+
             # Only add if URL doesn't exist or if it's in a different folder
             if (url not in existing_urls or
                 existing_urls[url][1] != folder):
@@ -51,6 +69,8 @@ class HistoryScheduler:
         try:
             # Get the latest timestamp from our database
             latest_entry = db.query(func.max(HistoryEntry.visit_time)).scalar()
+            if latest_entry:
+                latest_entry = self._normalize_datetime(latest_entry)

             # Fetch new history
             history = self.browser_collector.fetch_history()
@@ -58,11 +78,11 @@ class HistoryScheduler:
             # Filter to only get entries newer than our latest entry
             new_entries = []
             for visit_time, url, title in history:
+                # Normalize the datetime
+                visit_time = self._normalize_datetime(visit_time)
+
                 if not latest_entry or visit_time > latest_entry:
                     domain = self.browser_collector.get_domain(url)
-                    if not title:
-                        title = await self.page_fetcher.get_page_title(url)
-
                     entry = HistoryEntry(
                         url=url,
                         title=title,
@@ -83,3 +103,7 @@ class HistoryScheduler:

             # Wait for 5 minutes before next update
             await asyncio.sleep(300)
+
+    async def close(self):
+        """Cleanup resources"""
+        await self.page_reader.close()
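For illustration (not part of the commit), a self-contained sketch of the normalization rule _normalize_datetime applies: naive datetimes are assumed to be UTC, aware ones are converted to UTC. The dates are arbitrary examples:

from datetime import datetime
import pytz

naive = datetime(2024, 1, 1, 12, 0, 0)                # no tzinfo
aware = pytz.timezone("US/Pacific").localize(naive)   # 2024-01-01 12:00 PST (-08:00)

print(pytz.UTC.localize(naive))     # 2024-01-01 12:00:00+00:00
print(aware.astimezone(pytz.UTC))   # 2024-01-01 20:00:00+00:00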
app/websocket_server.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import asyncio
import websockets
import json
from page_info import PageInfo
from datetime import datetime

async def handle_websocket(websocket, path):
    try:
        async for message in websocket:
            data = json.loads(message)
            page_info = PageInfo(
                url=data['url'],
                html=data['html'],
                timestamp=datetime.fromisoformat(data['timestamp'])
            )
            print(f"Received page content from: {page_info.url}")
            # Here you can process the page_info object as needed

    except websockets.exceptions.ConnectionClosed:
        print("Client disconnected")
    except Exception as e:
        print(f"Error handling message: {e}")

async def start_server():
    server = await websockets.serve(handle_websocket, "localhost", 8765)
    print("WebSocket server started on ws://localhost:8765")
    await server.wait_closed()

def run_server():
    asyncio.run(start_server())

if __name__ == "__main__":
    run_server()
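For illustration (not part of the commit), a minimal client for the standalone server above, using the websockets package from requirements.txt; the page payload is made up and mirrors the fields handle_websocket reads:

import asyncio
import json
from datetime import datetime, timezone

import websockets

async def send_test_page():
    payload = {
        "url": "https://example.com/",
        "html": "<html><body><h1>Hello</h1></body></html>",
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    async with websockets.connect("ws://localhost:8765") as ws:
        await ws.send(json.dumps(payload))

asyncio.run(send_test_page())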
config/reader_config.yaml (new file, 15 lines)
@@ -0,0 +1,15 @@
# Domains to exclude from content reading
excluded_domains:
  # Local sites
  - localhost
  - 127.0.0.1

  # IP ranges
  - 192.168.*.*
  - 10.*.*.*
  - 172.16.*.*

  # Example wildcard patterns
  # - *.local
  # - reddit-*.com
  # - *.githubusercontent.com
extension/background.js (new file, 47 lines)
@@ -0,0 +1,47 @@
console.log("Background script loaded");

async function isContentScriptReady(tabId) {
    try {
        await browser.tabs.sendMessage(tabId, { type: "PING" });
        return true;
    } catch (error) {
        return false;
    }
}

async function waitForContentScript(tabId, maxAttempts = 10) {
    console.log(`Waiting for content script in tab ${tabId}`);
    for (let i = 0; i < maxAttempts; i++) {
        if (await isContentScriptReady(tabId)) {
            console.log(`Content script ready in tab ${tabId}`);
            return true;
        }
        console.log(`Attempt ${i + 1}: Content script not ready, waiting...`);
        await new Promise(resolve => setTimeout(resolve, 500));
    }
    console.log(`Content script not ready after ${maxAttempts} attempts`);
    return false;
}

async function sendMessageToTab(tabId) {
    try {
        console.log(`Checking content script status for tab ${tabId}`);
        if (await waitForContentScript(tabId)) {
            console.log(`Sending GET_PAGE_CONTENT message to tab ${tabId}`);
            await browser.tabs.sendMessage(tabId, {
                type: "GET_PAGE_CONTENT"
            });
            console.log(`Successfully sent message to tab ${tabId}`);
        }
    } catch (error) {
        console.error(`Error sending message to tab ${tabId}:`, error);
    }
}

browser.webNavigation.onCompleted.addListener(async (details) => {
    console.log("Navigation completed", details);
    if (details.frameId === 0) { // Only handle main frame navigation
        console.log(`Main frame navigation detected for tab ${details.tabId}`);
        await sendMessageToTab(details.tabId);
    }
});
extension/content.js (new file, 132 lines)
@@ -0,0 +1,132 @@
console.log("Content script starting initialization...");

// Function to log WebSocket state
function getWebSocketState(ws) {
    const states = {
        0: 'CONNECTING',
        1: 'OPEN',
        2: 'CLOSING',
        3: 'CLOSED'
    };
    return states[ws.readyState] || 'UNKNOWN';
}

class WebSocketClient {
    constructor() {
        console.log("WebSocketClient constructor called");
        this.messageQueue = [];
        this.connect();
        this.reconnectAttempts = 0;
        this.maxReconnectAttempts = 5;
    }

    connect() {
        console.log('Attempting to connect to WebSocket server...');
        try {
            this.ws = new WebSocket('ws://localhost:8523/ws');
            console.log('WebSocket instance created');

            this.ws.addEventListener('open', () => {
                console.log('WebSocket connection opened successfully');
                this.reconnectAttempts = 0;
                // Process any queued messages
                this.processQueue();
            });

            this.ws.addEventListener('error', (event) => {
                console.error('WebSocket error occurred:', event);
            });

            this.ws.addEventListener('close', (event) => {
                console.log('WebSocket connection closed:', event.code, event.reason);
                this.tryReconnect();
            });

            this.ws.addEventListener('message', (event) => {
                console.log('Received message from server:', event.data);
            });
        } catch (error) {
            console.error('Error creating WebSocket:', error);
        }
    }

    processQueue() {
        console.log(`Processing message queue (${this.messageQueue.length} messages)`);
        while (this.messageQueue.length > 0) {
            const data = this.messageQueue.shift();
            this.sendMessage(data);
        }
    }

    tryReconnect() {
        if (this.reconnectAttempts < this.maxReconnectAttempts) {
            this.reconnectAttempts++;
            console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`);
            setTimeout(() => this.connect(), 2000 * this.reconnectAttempts);
        } else {
            console.log('Max reconnection attempts reached');
        }
    }

    sendMessage(data) {
        console.log('sendMessage called, WebSocket state:', getWebSocketState(this.ws));
        if (this.ws.readyState === WebSocket.OPEN) {
            try {
                console.log('Preparing to send data:', {
                    url: data.url,
                    timestamp: data.timestamp,
                    htmlLength: data.html.length
                });
                this.ws.send(JSON.stringify(data));
                console.log('Data sent successfully');
                return true;
            } catch (error) {
                console.error('Error sending data:', error);
                return false;
            }
        } else {
            console.log('WebSocket not ready, queueing message');
            this.messageQueue.push(data);
            return true;
        }
    }
}

console.log("Creating WebSocketClient instance...");
const wsClient = new WebSocketClient();

console.log("Setting up message listener...");
browser.runtime.onMessage.addListener((message, sender, sendResponse) => {
    console.log('Message received from background script:', message);

    if (message.type === "PING") {
        console.log('Received PING, responding...');
        return Promise.resolve({ status: "ready" });
    }

    if (message.type === "GET_PAGE_CONTENT") {
        console.log('Processing GET_PAGE_CONTENT message');
        const pageContent = {
            url: window.location.href,
            html: document.documentElement.outerHTML,
            timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
        };

        console.log('Created page content object for:', pageContent.url);
        wsClient.sendMessage(pageContent);
    }

    return true;
});

// Send initial page content
console.log('Sending initial page content...');
const pageContent = {
    url: window.location.href,
    html: document.documentElement.outerHTML,
    timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
};

wsClient.sendMessage(pageContent);

console.log("Content script initialization complete for:", window.location.href);
extension/manifest.json (new file, 35 lines)
@@ -0,0 +1,35 @@
{
    "manifest_version": 2,
    "name": "Page Content Sender",
    "version": "1.0",
    "description": "Sends page content via WebSocket when a page loads",
    "permissions": [
        "webNavigation",
        "activeTab",
        "<all_urls>",
        "tabs"
    ],
    "background": {
        "scripts": [
            "background.js"
        ],
        "persistent": true
    },
    "content_scripts": [
        {
            "matches": [
                "<all_urls>"
            ],
            "js": [
                "content.js"
            ],
            "run_at": "document_idle",
            "all_frames": false
        }
    ],
    "browser_specific_settings": {
        "gecko": {
            "id": "page-content-sender@example.com"
        }
    }
}
@@ -82,9 +82,3 @@ def html_to_markdown(url: str) -> str:

     # Clean up excessive whitespace
     return clean_whitespace(markdown)
-
-if __name__ == "__main__":
-    # Example usage
-    url = "https://reddit.com"
-    markdown_content = html_to_markdown(url)
-    print(markdown_content)
requirements.txt (modified)
@@ -1,8 +1,10 @@
-fastapi==0.109.2
-uvicorn==0.27.1
-sqlalchemy==2.0.27
-browser-history==0.4.1
-aiohttp==3.9.3
-beautifulsoup4==4.12.3
-httpx==0.27.0
-markdownify==0.11.6
+fastapi
+uvicorn
+sqlalchemy
+browser-history
+beautifulsoup4
+markdownify
+pyyaml
+pytz
+websockets==11.0.3
+iso8601==2.1.0