All the things

2025-01-25 22:42:04 -06:00
parent d556823350
commit 0db1065d10
16 changed files with 678 additions and 55 deletions

.env.example (new file, +6 lines)

@@ -0,0 +1,6 @@
# Meilisearch Configuration
MEILISEARCH_HOST=http://localhost:7700
# Generate a master key using: openssl rand -hex 32
MEILISEARCH_MASTER_KEY=your_master_key_here
# Example master key: 6d99b335033595ea62d02a5641b94e04e80c33c1e1f1f789c84445ff5
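Nothing in this commit reads these variables yet. A minimal sketch of how they might be loaded, assuming python-dotenv (not listed in requirements.txt) and plain os.environ access:

# Hypothetical loader for the Meilisearch settings above (not part of this commit)
import os
from dotenv import load_dotenv  # assumes python-dotenv is installed

load_dotenv()  # read .env from the working directory

MEILISEARCH_HOST = os.getenv("MEILISEARCH_HOST", "http://localhost:7700")
MEILISEARCH_MASTER_KEY = os.getenv("MEILISEARCH_MASTER_KEY")
if not MEILISEARCH_MASTER_KEY:
    raise RuntimeError("MEILISEARCH_MASTER_KEY is not set; see .env.example")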

app/__init__.py (new file, +1 line)

@@ -0,0 +1 @@
# This file can be empty; it just marks the directory as a Python package

app/config.py (new file, +92 lines)

@@ -0,0 +1,92 @@
import yaml
from pathlib import Path
from typing import Set
import fnmatch
class ReaderConfig:
    def __init__(self):
        self.excluded_patterns: Set[str] = set()
        self._load_config()

    def _load_config(self):
        config_path = Path("config/reader_config.yaml")
        if not config_path.exists():
            print("Warning: reader_config.yaml not found, creating default config")
            self._create_default_config(config_path)

        try:
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
                self.excluded_patterns = set(config.get('excluded_domains', []))
        except Exception as e:
            print(f"Error loading config: {e}")
            self.excluded_patterns = set()

    def _create_default_config(self, config_path: Path):
        config_path.parent.mkdir(parents=True, exist_ok=True)
        default_config = {
            'excluded_domains': [
                'localhost',
                '127.0.0.1',
                '192.168.*.*',
                '10.*.*.*'
            ]
        }
        with open(config_path, 'w') as f:
            yaml.safe_dump(default_config, f, default_flow_style=False)

    def is_domain_excluded(self, domain: str) -> bool:
        """
        Check if a domain matches any exclusion pattern.
        Supports glob-style wildcards (* and ?).

        Examples:
        - '*.example.com' matches any subdomain of example.com
        - 'reddit-*.com' matches reddit-video.com, reddit-static.com, etc.
        - '192.168.*.*' matches any IP in the 192.168.0.0/16 subnet
        """
        domain = domain.lower()

        # Check each pattern
        for pattern in self.excluded_patterns:
            pattern = pattern.lower()

            # Handle IP address patterns specially
            if any(c.isdigit() for c in pattern):
                if self._match_ip_pattern(domain, pattern):
                    return True

            # Handle domain patterns
            if fnmatch.fnmatch(domain, pattern):
                return True

            # Also check if the pattern matches when prepended with a dot
            # This handles cases like 'example.com' matching 'subdomain.example.com'
            if fnmatch.fnmatch(domain, f"*.{pattern}"):
                return True

        return False

    def _match_ip_pattern(self, domain: str, pattern: str) -> bool:
        """
        Special handling for IP address patterns.
        Handles cases like '192.168.*.*' matching '192.168.1.1'.
        """
        # Skip if the domain isn't IP-like
        if not any(c.isdigit() for c in domain):
            return False

        # Split into octets
        domain_parts = domain.split('.')
        pattern_parts = pattern.split('.')

        # Must have the same number of parts
        if len(domain_parts) != len(pattern_parts):
            return False

        # Check each octet
        for domain_part, pattern_part in zip(domain_parts, pattern_parts):
            if pattern_part == '*':
                continue
            if domain_part != pattern_part:
                return False

        return True
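Illustrative usage of the exclusion check, assuming it runs from the repository root so config/reader_config.yaml (or the generated default) is found:

# Example usage (not part of the commit)
from app.config import ReaderConfig

config = ReaderConfig()
print(config.is_domain_excluded("localhost"))      # True: listed directly
print(config.is_domain_excluded("192.168.1.10"))   # True: matches 192.168.*.*
print(config.is_domain_excluded("example.com"))    # False: no pattern matches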

app/database.py (modified)

@@ -1,6 +1,7 @@
-from sqlalchemy import create_engine, Column, Integer, String, DateTime
+from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
+from datetime import datetime

 SQLALCHEMY_DATABASE_URL = "sqlite:///./browser_history.db"
@@ -10,13 +11,15 @@ SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 Base = declarative_base()

 class HistoryEntry(Base):
-    __tablename__ = "history_entries"
+    __tablename__ = "history"

-    id = Column(Integer, primary_key=True, index=True)
-    url = Column(String, index=True)
-    title = Column(String, nullable=True)
-    visit_time = Column(DateTime, index=True)
-    domain = Column(String, index=True)
+    id = Column(Integer, primary_key=True)
+    url = Column(String)
+    title = Column(String)
+    visit_time = Column(DateTime)
+    domain = Column(String)
+    markdown_content = Column(Text, nullable=True)
+    last_content_update = Column(DateTime, nullable=True)

 class Bookmark(Base):
     __tablename__ = "bookmarks"
@@ -28,6 +31,37 @@ class Bookmark(Base):
     folder = Column(String, index=True)
     domain = Column(String, index=True)

+class BlacklistedDomain(Base):
+    __tablename__ = "blacklisted_domains"
+
+    id = Column(Integer, primary_key=True)
+    domain = Column(String, unique=True, index=True)
+    reason = Column(String, nullable=True)
+    added_time = Column(DateTime, default=datetime.utcnow)
+
+    @classmethod
+    def is_blacklisted(cls, db: SessionLocal, domain: str) -> bool:
+        """Check if a domain is blacklisted"""
+        return db.query(cls).filter(cls.domain == domain.lower()).first() is not None
+
+    @classmethod
+    def add_to_blacklist(cls, db: SessionLocal, domain: str, reason: str = None):
+        """Add a domain to the blacklist"""
+        try:
+            blacklist_entry = cls(
+                domain=domain.lower(),
+                reason=reason
+            )
+            db.add(blacklist_entry)
+            db.commit()
+        except:
+            db.rollback()
+            # If entry already exists, just update the reason
+            existing = db.query(cls).filter(cls.domain == domain.lower()).first()
+            if existing and reason:
+                existing.reason = reason
+                db.commit()
+
 Base.metadata.create_all(bind=engine)

 def get_db():
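For illustration, the new blacklist model might be used like this; the import path assumes the models above live in app/database.py, as the relative imports elsewhere in the commit suggest:

# Hypothetical usage of BlacklistedDomain (not part of the diff)
from app.database import SessionLocal, BlacklistedDomain

db = SessionLocal()
try:
    BlacklistedDomain.add_to_blacklist(db, "ads.example.com", reason="tracking")
    # Lookups are lowercased, so case does not matter:
    print(BlacklistedDomain.is_blacklisted(db, "ADS.example.com"))  # True
finally:
    db.close()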

app/main.py (modified)

@@ -1,15 +1,31 @@
-from fastapi import FastAPI, Depends, Query
+from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect
 from sqlalchemy.orm import Session
-from datetime import datetime
+from datetime import datetime, timezone
-from typing import List
+from typing import List, Optional
 import asyncio
+from fastapi import WebSocketDisconnect
+from urllib.parse import urlparse
+import pytz
+from fastapi.middleware.cors import CORSMiddleware
+import iso8601

 from .database import get_db, HistoryEntry, Bookmark
 from .scheduler import HistoryScheduler
+from .page_info import PageInfo
+from .page_reader import PageReader

 app = FastAPI()
 scheduler = HistoryScheduler()

+# Add CORS middleware to allow WebSocket connections
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, specify your domains
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)

 @app.on_event("startup")
 async def startup_event():
     # Initial bookmark fetch
@@ -17,12 +33,37 @@ async def startup_event():
     # Start the background task
     asyncio.create_task(scheduler.update_history())

+def serialize_history_entry(entry, include_content: bool = False):
+    """Serialize a HistoryEntry object to a dictionary"""
+    result = {
+        "id": entry.id,
+        "url": entry.url,
+        "title": entry.title,
+        "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
+        "domain": entry.domain,
+    }
+    if include_content:
+        result["markdown_content"] = entry.markdown_content
+    return result
+
+def serialize_bookmark(bookmark):
+    """Serialize a Bookmark object to a dictionary"""
+    return {
+        "id": bookmark.id,
+        "url": bookmark.url,
+        "title": bookmark.title,
+        "added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
+        "folder": bookmark.folder,
+        "domain": bookmark.domain,
+    }
+
 @app.get("/history/search")
 async def search_history(
-    domain: str = Query(None),
+    domain: Optional[str] = Query(None),
-    start_date: datetime = Query(None),
+    start_date: Optional[datetime] = Query(None),
-    end_date: datetime = Query(None),
+    end_date: Optional[datetime] = Query(None),
-    search_term: str = Query(None),
+    search_term: Optional[str] = Query(None),
+    include_content: bool = Query(False),
     db: Session = Depends(get_db)
 ):
     query = db.query(HistoryEntry)
@@ -37,15 +78,19 @@ async def search_history(
         query = query.filter(HistoryEntry.visit_time <= end_date)

     if search_term:
-        query = query.filter(HistoryEntry.title.ilike(f"%{search_term}%"))
+        query = query.filter(
+            (HistoryEntry.title.ilike(f"%{search_term}%")) |
+            (HistoryEntry.markdown_content.ilike(f"%{search_term}%"))
+        )

-    return query.all()
+    entries = query.all()
+    return [serialize_history_entry(entry, include_content) for entry in entries]

 @app.get("/bookmarks/search")
 async def search_bookmarks(
-    domain: str = Query(None),
+    domain: Optional[str] = Query(None),
-    folder: str = Query(None),
+    folder: Optional[str] = Query(None),
-    search_term: str = Query(None),
+    search_term: Optional[str] = Query(None),
     db: Session = Depends(get_db)
 ):
     query = db.query(Bookmark)
@@ -59,4 +104,73 @@ async def search_bookmarks(
     if search_term:
         query = query.filter(Bookmark.title.ilike(f"%{search_term}%"))

-    return query.all()
+    bookmarks = query.all()
+    return [serialize_bookmark(bookmark) for bookmark in bookmarks]
+
+@app.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
+    print("WebSocket endpoint called")
+    page_reader = PageReader()
+    print("New WebSocket connection established")
+    await websocket.accept()
+    print("WebSocket connection accepted")
+
+    try:
+        while True:
+            print("Waiting for message...")
+            data = await websocket.receive_json()
+            print(f"Received message for URL: {data['url']}")
+            print(f"HTML content length: {len(data['html'])}")
+            print(f"Timestamp: {data['timestamp']}")
+
+            # Parse the ISO timestamp correctly
+            timestamp = iso8601.parse_date(data['timestamp'])
+
+            page_info = PageInfo(
+                url=data['url'],
+                html=data['html'],
+                timestamp=timestamp
+            )
+            print(f"Created PageInfo object for: {page_info.url}")
+
+            # Convert HTML to markdown
+            print("Converting HTML to markdown...")
+            markdown_content = page_reader.html_to_markdown(page_info.html)
+            print(f"Markdown conversion complete, length: {len(markdown_content) if markdown_content else 0}")
+
+            # Update or create history entry
+            domain = urlparse(page_info.url).netloc
+            print(f"Creating history entry for domain: {domain}")
+            history_entry = HistoryEntry(
+                url=page_info.url,
+                visit_time=page_info.timestamp,
+                domain=domain,
+                markdown_content=markdown_content,
+                last_content_update=datetime.now(timezone.utc)
+            )
+
+            print("Saving to database...")
+            db.add(history_entry)
+            db.commit()
+            print("Database save complete")
+
+            # Send confirmation back to client
+            await websocket.send_json({
+                "status": "success",
+                "message": f"Processed page: {page_info.url}"
+            })
+    except WebSocketDisconnect:
+        print("Client disconnected")
+    except Exception as e:
+        print(f"Error handling message: {e}")
+        # Send error back to client if possible
+        try:
+            await websocket.send_json({
+                "status": "error",
+                "message": str(e)
+            })
+        except:
+            pass
+    finally:
+        print("Cleaning up resources")
+        page_reader.close()
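The commit ships no test client for the new endpoint. A minimal sketch of exercising it from Python, assuming the API is served by uvicorn on port 8523 (the port extension/content.js connects to) and that the websockets package from requirements.txt plus an HTTP client such as requests are available:

# Hypothetical end-to-end check (not part of the commit)
import asyncio
import json
from datetime import datetime, timezone

import requests
import websockets

async def send_page():
    async with websockets.connect("ws://localhost:8523/ws") as ws:
        await ws.send(json.dumps({
            "url": "https://example.com/",
            "html": "<html><body><h1>Hello</h1></body></html>",
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }))
        print(await ws.recv())  # expect {"status": "success", ...}

asyncio.run(send_page())

# The stored markdown is then searchable via the extended REST endpoint:
resp = requests.get(
    "http://localhost:8523/history/search",
    params={"search_term": "Hello", "include_content": "true"},
)
print(resp.json())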

app/page_info.py (modified)

@@ -1,16 +1,8 @@
-import asyncio
-import aiohttp
-from bs4 import BeautifulSoup
-from typing import Optional
+from dataclasses import dataclass
+from datetime import datetime

-class PageInfoFetcher:
-    async def get_page_title(self, url: str) -> Optional[str]:
-        try:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(url, timeout=5) as response:
-                    if response.status == 200:
-                        html = await response.text()
-                        soup = BeautifulSoup(html, 'html.parser')
-                        return soup.title.string if soup.title else None
-        except:
-            return None
+@dataclass
+class PageInfo:
+    url: str
+    html: str
+    timestamp: datetime

app/page_reader.py (new file, +101 lines)

@@ -0,0 +1,101 @@
import re
from markdownify import markdownify as md
from bs4 import BeautifulSoup
from typing import Optional
from urllib.parse import urlparse
from .config import ReaderConfig
import logging
from .database import SessionLocal, BlacklistedDomain
# Setup logging with less verbose output
logging.basicConfig(
level=logging.WARNING,
format='%(levelname)s: %(message)s'
)
logger = logging.getLogger(__name__)
# Patterns for cleaning
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
META_PATTERN = r"<[ ]*meta.*?>"
COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
LINK_PATTERN = r"<[ ]*link.*?>"
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
class PageReader:
    def __init__(self):
        self.config = ReaderConfig()
        self.db = SessionLocal()

    def clean_html(self, html: str) -> str:
        """Clean HTML by removing unwanted elements and patterns."""
        if not html:
            return ""

        # First use regex to remove problematic patterns
        html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(BASE64_IMG_PATTERN, "", html)

        try:
            # Use BeautifulSoup to remove additional elements we want to strip
            soup = BeautifulSoup(html, 'html.parser')

            # Remove unwanted elements
            elements_to_remove = [
                'canvas', 'img', 'picture', 'audio', 'video',
                'iframe', 'embed', 'object', 'param', 'track',
                'map', 'area', 'source'
            ]
            for element in elements_to_remove:
                for tag in soup.find_all(element):
                    tag.decompose()

            return str(soup)
        except Exception as e:
            logger.error(f"Error cleaning HTML: {e}")
            return ""

    def clean_whitespace(self, text: str) -> str:
        """Clean excessive whitespace from text."""
        if not text:
            return ""

        try:
            # Replace 3 or more newlines with 2 newlines
            cleaned = re.sub(r'\n{3,}', '\n\n', text)
            # Remove trailing whitespace from each line
            cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines())
            return cleaned.strip()
        except Exception as e:
            logger.error(f"Error cleaning whitespace: {e}")
            return ""

    def html_to_markdown(self, html: str) -> Optional[str]:
        """Convert HTML to markdown."""
        try:
            cleaned_html = self.clean_html(html)
            if not cleaned_html:
                return None

            return self.clean_whitespace(md(cleaned_html,
                                            heading_style="ATX",
                                            bullets="-",
                                            autolinks=True,
                                            strip=['form'],
                                            escape_asterisks=True,
                                            escape_underscores=True))
        except Exception as e:
            logger.error(f"Error converting to markdown: {e}")
            return None

    def close(self):
        """Cleanup resources"""
        self.db.close()
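A short sketch of the conversion path, using an inline HTML snippet rather than a live page:

# Example usage (not part of the commit)
from app.page_reader import PageReader

reader = PageReader()
html = ("<html><head><script>alert('x')</script></head>"
        "<body><h1>Title</h1><p>Some <b>bold</b> text.</p><img src='x.png'></body></html>")
print(reader.html_to_markdown(html))
# Roughly:
# # Title
#
# Some **bold** text.
reader.close()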

app/scheduler.py (modified)

@@ -3,14 +3,29 @@ from datetime import datetime, timedelta
 import asyncio
 from .database import SessionLocal, HistoryEntry, Bookmark
 from .browser import BrowserHistoryCollector
-from .page_info import PageInfoFetcher
+from .page_reader import PageReader
 from sqlalchemy import func
+from sqlalchemy.orm import Session
+import pytz

 class HistoryScheduler:
     def __init__(self):
         self.browser_collector = BrowserHistoryCollector()
-        self.page_fetcher = PageInfoFetcher()
+        self.page_reader = PageReader()
         self.last_history_update = None
+        self.content_update_interval = timedelta(hours=24)  # Update content daily
+
+    def _normalize_datetime(self, dt: datetime) -> datetime:
+        """Convert datetime to UTC if it has timezone, or make it timezone-aware if it doesn't"""
+        if dt is None:
+            return None
+        # If datetime is naive (no timezone), assume it's in UTC
+        if dt.tzinfo is None:
+            return pytz.UTC.localize(dt)
+        # If datetime has timezone, convert to UTC
+        return dt.astimezone(pytz.UTC)

     async def update_bookmarks(self):
         bookmarks = self.browser_collector.fetch_bookmarks()
@@ -26,6 +41,9 @@ class HistoryScheduler:
         new_entries = []
         for added_time, url, title, folder in bookmarks:
+            # Normalize the datetime
+            added_time = self._normalize_datetime(added_time)
+
             # Only add if URL doesn't exist or if it's in a different folder
             if (url not in existing_urls or
                     existing_urls[url][1] != folder):
@@ -51,6 +69,8 @@
         try:
             # Get the latest timestamp from our database
             latest_entry = db.query(func.max(HistoryEntry.visit_time)).scalar()
+            if latest_entry:
+                latest_entry = self._normalize_datetime(latest_entry)

             # Fetch new history
             history = self.browser_collector.fetch_history()
@@ -58,11 +78,11 @@
             # Filter to only get entries newer than our latest entry
             new_entries = []
             for visit_time, url, title in history:
+                # Normalize the datetime
+                visit_time = self._normalize_datetime(visit_time)
                 if not latest_entry or visit_time > latest_entry:
                     domain = self.browser_collector.get_domain(url)
-                    if not title:
-                        title = await self.page_fetcher.get_page_title(url)
                     entry = HistoryEntry(
                         url=url,
                         title=title,
@@ -83,3 +103,7 @@
             # Wait for 5 minutes before next update
             await asyncio.sleep(300)
+
+    async def close(self):
+        """Cleanup resources"""
+        await self.page_reader.close()
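For reference, the normalization added above behaves like this (illustrative values, not part of the commit):

# Illustration of _normalize_datetime semantics
from datetime import datetime
import pytz

naive = datetime(2025, 1, 25, 22, 42)                     # no tzinfo
aware = pytz.timezone("America/Chicago").localize(naive)  # 22:42 at UTC-6 in January

# Naive datetimes are assumed to already be in UTC:
#   pytz.UTC.localize(naive)   -> 2025-01-25 22:42:00+00:00
# Aware datetimes are converted to UTC:
#   aware.astimezone(pytz.UTC) -> 2025-01-26 04:42:00+00:00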

app/websocket_server.py (new file, +33 lines)

@@ -0,0 +1,33 @@
import asyncio
import websockets
import json
from page_info import PageInfo
from datetime import datetime
async def handle_websocket(websocket, path):
    try:
        async for message in websocket:
            data = json.loads(message)
            page_info = PageInfo(
                url=data['url'],
                html=data['html'],
                timestamp=datetime.fromisoformat(data['timestamp'])
            )
            print(f"Received page content from: {page_info.url}")
            # Here you can process the page_info object as needed
    except websockets.exceptions.ConnectionClosed:
        print("Client disconnected")
    except Exception as e:
        print(f"Error handling message: {e}")

async def start_server():
    server = await websockets.serve(handle_websocket, "localhost", 8765)
    print("WebSocket server started on ws://localhost:8765")
    await server.wait_closed()

def run_server():
    asyncio.run(start_server())

if __name__ == "__main__":
    run_server()

config/reader_config.yaml (new file, +15 lines)

@@ -0,0 +1,15 @@
# Domains to exclude from content reading
excluded_domains:
# Local sites
- localhost
- 127.0.0.1
# IP ranges
- 192.168.*.*
- 10.*.*.*
- 172.16.*.*
# Example wildcard patterns
# - *.local
# - reddit-*.com
# - *.githubusercontent.com

extension/background.js (new file, +47 lines)

@@ -0,0 +1,47 @@
console.log("Background script loaded");
async function isContentScriptReady(tabId) {
    try {
        await browser.tabs.sendMessage(tabId, { type: "PING" });
        return true;
    } catch (error) {
        return false;
    }
}

async function waitForContentScript(tabId, maxAttempts = 10) {
    console.log(`Waiting for content script in tab ${tabId}`);
    for (let i = 0; i < maxAttempts; i++) {
        if (await isContentScriptReady(tabId)) {
            console.log(`Content script ready in tab ${tabId}`);
            return true;
        }
        console.log(`Attempt ${i + 1}: Content script not ready, waiting...`);
        await new Promise(resolve => setTimeout(resolve, 500));
    }
    console.log(`Content script not ready after ${maxAttempts} attempts`);
    return false;
}

async function sendMessageToTab(tabId) {
    try {
        console.log(`Checking content script status for tab ${tabId}`);
        if (await waitForContentScript(tabId)) {
            console.log(`Sending GET_PAGE_CONTENT message to tab ${tabId}`);
            await browser.tabs.sendMessage(tabId, {
                type: "GET_PAGE_CONTENT"
            });
            console.log(`Successfully sent message to tab ${tabId}`);
        }
    } catch (error) {
        console.error(`Error sending message to tab ${tabId}:`, error);
    }
}

browser.webNavigation.onCompleted.addListener(async (details) => {
    console.log("Navigation completed", details);
    if (details.frameId === 0) { // Only handle main frame navigation
        console.log(`Main frame navigation detected for tab ${details.tabId}`);
        await sendMessageToTab(details.tabId);
    }
});

extension/content.js (new file, +132 lines)

@@ -0,0 +1,132 @@
console.log("Content script starting initialization...");
// Function to log WebSocket state
function getWebSocketState(ws) {
    const states = {
        0: 'CONNECTING',
        1: 'OPEN',
        2: 'CLOSING',
        3: 'CLOSED'
    };
    return states[ws.readyState] || 'UNKNOWN';
}

class WebSocketClient {
    constructor() {
        console.log("WebSocketClient constructor called");
        this.messageQueue = [];
        this.connect();
        this.reconnectAttempts = 0;
        this.maxReconnectAttempts = 5;
    }

    connect() {
        console.log('Attempting to connect to WebSocket server...');
        try {
            this.ws = new WebSocket('ws://localhost:8523/ws');
            console.log('WebSocket instance created');

            this.ws.addEventListener('open', () => {
                console.log('WebSocket connection opened successfully');
                this.reconnectAttempts = 0;
                // Process any queued messages
                this.processQueue();
            });

            this.ws.addEventListener('error', (event) => {
                console.error('WebSocket error occurred:', event);
            });

            this.ws.addEventListener('close', (event) => {
                console.log('WebSocket connection closed:', event.code, event.reason);
                this.tryReconnect();
            });

            this.ws.addEventListener('message', (event) => {
                console.log('Received message from server:', event.data);
            });
        } catch (error) {
            console.error('Error creating WebSocket:', error);
        }
    }

    processQueue() {
        console.log(`Processing message queue (${this.messageQueue.length} messages)`);
        while (this.messageQueue.length > 0) {
            const data = this.messageQueue.shift();
            this.sendMessage(data);
        }
    }

    tryReconnect() {
        if (this.reconnectAttempts < this.maxReconnectAttempts) {
            this.reconnectAttempts++;
            console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`);
            setTimeout(() => this.connect(), 2000 * this.reconnectAttempts);
        } else {
            console.log('Max reconnection attempts reached');
        }
    }

    sendMessage(data) {
        console.log('sendMessage called, WebSocket state:', getWebSocketState(this.ws));
        if (this.ws.readyState === WebSocket.OPEN) {
            try {
                console.log('Preparing to send data:', {
                    url: data.url,
                    timestamp: data.timestamp,
                    htmlLength: data.html.length
                });
                this.ws.send(JSON.stringify(data));
                console.log('Data sent successfully');
                return true;
            } catch (error) {
                console.error('Error sending data:', error);
                return false;
            }
        } else {
            console.log('WebSocket not ready, queueing message');
            this.messageQueue.push(data);
            return true;
        }
    }
}

console.log("Creating WebSocketClient instance...");
const wsClient = new WebSocketClient();

console.log("Setting up message listener...");
browser.runtime.onMessage.addListener((message, sender, sendResponse) => {
    console.log('Message received from background script:', message);

    if (message.type === "PING") {
        console.log('Received PING, responding...');
        return Promise.resolve({ status: "ready" });
    }

    if (message.type === "GET_PAGE_CONTENT") {
        console.log('Processing GET_PAGE_CONTENT message');
        const pageContent = {
            url: window.location.href,
            html: document.documentElement.outerHTML,
            timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
        };
        console.log('Created page content object for:', pageContent.url);
        wsClient.sendMessage(pageContent);
    }
    return true;
});

// Send initial page content
console.log('Sending initial page content...');
const pageContent = {
    url: window.location.href,
    html: document.documentElement.outerHTML,
    timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
};
wsClient.sendMessage(pageContent);

console.log("Content script initialization complete for:", window.location.href);

extension/manifest.json (new file, +35 lines)

@@ -0,0 +1,35 @@
{
    "manifest_version": 2,
    "name": "Page Content Sender",
    "version": "1.0",
    "description": "Sends page content via WebSocket when a page loads",
    "permissions": [
        "webNavigation",
        "activeTab",
        "<all_urls>",
        "tabs"
    ],
    "background": {
        "scripts": [
            "background.js"
        ],
        "persistent": true
    },
    "content_scripts": [
        {
            "matches": [
                "<all_urls>"
            ],
            "js": [
                "content.js"
            ],
            "run_at": "document_idle",
            "all_frames": false
        }
    ],
    "browser_specific_settings": {
        "gecko": {
            "id": "page-content-sender@example.com"
        }
    }
}

View File

@@ -82,9 +82,3 @@ def html_to_markdown(url: str) -> str:
     # Clean up excessive whitespace
     return clean_whitespace(markdown)
-
-if __name__ == "__main__":
-    # Example usage
-    url = "https://reddit.com"
-    markdown_content = html_to_markdown(url)
-    print(markdown_content)

requirements.txt (modified)

@@ -1,8 +1,10 @@
-fastapi==0.109.2
-uvicorn==0.27.1
-sqlalchemy==2.0.27
-browser-history==0.4.1
-aiohttp==3.9.3
-beautifulsoup4==4.12.3
-httpx==0.27.0
-markdownify==0.11.6
+fastapi
+uvicorn
+sqlalchemy
+browser-history
+beautifulsoup4
+markdownify
+pyyaml
+pytz
+websockets==11.0.3
+iso8601==2.1.0

terminal (new file, +1 line)

@@ -0,0 +1 @@
rm app/websocket_server.py