mirror of https://github.com/Zetaphor/browser-recall.git
Historical records use original timestamp

@@ -1,9 +1,10 @@
-from typing import Tuple
+from typing import Tuple, Optional
 from urllib.parse import urlparse
 from database import Database
 from domain_exclusions import DomainExclusions
 from logger import Logger
 from crawl4ai import AsyncWebCrawler
+from datetime import datetime
 
 class BaseCrawler:
     def __init__(self, db: Database, domain_exclusions: DomainExclusions, logger: Logger):
@@ -34,7 +35,7 @@ class BaseCrawler:
 
         return False, ""
 
-    async def crawl_url(self, url: str, default_title: str = None) -> Tuple[bool, dict]:
+    async def crawl_url(self, url: str, default_title: str = None, created_timestamp: Optional[datetime] = None) -> Tuple[bool, dict]:
         try:
             result = await self.crawler.arun(url=url)
             crawl_result = result[0]
@@ -44,7 +45,8 @@ class BaseCrawler:
             self.db.add_history(
                 url=url,
                 title=title,
-                content=content
+                content=content,
+                created_timestamp=created_timestamp
             )
 
             return True, {
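
Note on the BaseCrawler change above: created_timestamp is a new optional keyword, so existing callers that omit it keep the old behaviour (the database falls back to the current time). A minimal usage sketch, assuming a BaseCrawler instance named crawler and a visit time taken from a history record; the function and variable names here are illustrative, not from this diff:

    from datetime import datetime

    async def crawl_with_original_time(crawler, url: str, title: str, visited_at: datetime):
        # Forward the original visit time so Database.add_history stores it in
        # "created"; leave created_timestamp unset for live crawls.
        success, info = await crawler.crawl_url(url, default_title=title,
                                                created_timestamp=visited_at)
        return success, info
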
@@ -59,14 +59,15 @@ class Database:
 
         self.conn.commit()
 
-    def add_history(self, url: str, title: str, content: str) -> int:
+    def add_history(self, url: str, title: str, content: str, created_timestamp: Optional[datetime] = None) -> int:
         """Add a new history entry."""
         now = datetime.utcnow()
+        created_time = created_timestamp if created_timestamp else now
         with self._lock:
             self.cursor.execute('''
                 INSERT INTO history (url, title, content, created, updated)
                 VALUES (?, ?, ?, ?, ?)
-            ''', (url, title, content, now, now))
+            ''', (url, title, content, created_time, now))
             self.conn.commit()
         return self.cursor.lastrowid
 
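
The effect of the add_history change is easiest to see in isolation: "created" takes the caller-supplied timestamp when one is given, while "updated" always records the insert time. Below is a self-contained sketch against an in-memory SQLite table; the schema is reduced to the columns named in the INSERT above and is assumed, not taken from the repository:

    import sqlite3
    from datetime import datetime
    from typing import Optional

    conn = sqlite3.connect(":memory:")
    cur = conn.cursor()
    cur.execute("CREATE TABLE history (url TEXT, title TEXT, content TEXT, "
                "created TIMESTAMP, updated TIMESTAMP)")

    def add_history(url: str, title: str, content: str,
                    created_timestamp: Optional[datetime] = None) -> int:
        # Mirrors the patched fallback: an explicit created_timestamp wins,
        # otherwise both columns get the current UTC time.
        now = datetime.utcnow()
        created_time = created_timestamp if created_timestamp else now
        cur.execute(
            "INSERT INTO history (url, title, content, created, updated) "
            "VALUES (?, ?, ?, ?, ?)",
            (url, title, content, created_time, now),
        )
        conn.commit()
        return cur.lastrowid

    # A backfilled entry keeps its original visit time in "created".
    add_history("https://example.com/a", "Example A", "...", datetime(2024, 5, 1, 12, 30))
    # A live crawl stores the current time in both columns.
    add_history("https://example.com/b", "Example B", "...")
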
@@ -63,7 +63,7 @@ class HistoryCrawler(BaseCrawler):
                     self.logger.info(f"Skipping URL from history: {url} ({skip_reason})")
                     continue
 
-                success, result = await self.crawl_url(url, title)
+                success, result = await self.crawl_url(url, title, created_timestamp=timestamp)
                 if success:
                     self.logger.info(f"Processed historical URL: {url}")
 
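
This HistoryCrawler hunk is where the commit's intent lands: each record's own timestamp is forwarded as created_timestamp, so backfilled pages keep their original visit time instead of the crawl time. The surrounding loop is not shown in the diff; the sketch below is an assumed reconstruction of that flow as a standalone function, with the entries iterable, its (url, title, timestamp) shape, and the function name all being illustrative:

    from datetime import datetime
    from typing import Iterable, Tuple

    async def process_history(crawler, logger, entries: Iterable[Tuple[str, str, datetime]]) -> None:
        # Assumed record shape: URL, page title, and the time the page was
        # originally visited (domain-exclusion skipping omitted for brevity).
        for url, title, timestamp in entries:
            success, result = await crawler.crawl_url(url, title, created_timestamp=timestamp)
            if success:
                logger.info(f"Processed historical URL: {url}")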