mirror of
https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 02:19:37 +00:00
Historical records use original timestamp
This commit is contained in:
@@ -1,9 +1,10 @@
|
|||||||
from typing import Tuple
|
from typing import Tuple, Optional
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from database import Database
|
from database import Database
|
||||||
from domain_exclusions import DomainExclusions
|
from domain_exclusions import DomainExclusions
|
||||||
from logger import Logger
|
from logger import Logger
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
class BaseCrawler:
|
class BaseCrawler:
|
||||||
def __init__(self, db: Database, domain_exclusions: DomainExclusions, logger: Logger):
|
def __init__(self, db: Database, domain_exclusions: DomainExclusions, logger: Logger):
|
||||||
@@ -34,7 +35,7 @@ class BaseCrawler:
|
|||||||
|
|
||||||
return False, ""
|
return False, ""
|
||||||
|
|
||||||
async def crawl_url(self, url: str, default_title: str = None) -> Tuple[bool, dict]:
|
async def crawl_url(self, url: str, default_title: str = None, created_timestamp: Optional[datetime] = None) -> Tuple[bool, dict]:
|
||||||
try:
|
try:
|
||||||
result = await self.crawler.arun(url=url)
|
result = await self.crawler.arun(url=url)
|
||||||
crawl_result = result[0]
|
crawl_result = result[0]
|
||||||
@@ -44,7 +45,8 @@ class BaseCrawler:
|
|||||||
self.db.add_history(
|
self.db.add_history(
|
||||||
url=url,
|
url=url,
|
||||||
title=title,
|
title=title,
|
||||||
content=content
|
content=content,
|
||||||
|
created_timestamp=created_timestamp
|
||||||
)
|
)
|
||||||
|
|
||||||
return True, {
|
return True, {
|
||||||
|
|||||||
@@ -59,14 +59,15 @@ class Database:
|
|||||||
|
|
||||||
self.conn.commit()
|
self.conn.commit()
|
||||||
|
|
||||||
def add_history(self, url: str, title: str, content: str) -> int:
|
def add_history(self, url: str, title: str, content: str, created_timestamp: Optional[datetime] = None) -> int:
|
||||||
"""Add a new history entry."""
|
"""Add a new history entry."""
|
||||||
now = datetime.utcnow()
|
now = datetime.utcnow()
|
||||||
|
created_time = created_timestamp if created_timestamp else now
|
||||||
with self._lock:
|
with self._lock:
|
||||||
self.cursor.execute('''
|
self.cursor.execute('''
|
||||||
INSERT INTO history (url, title, content, created, updated)
|
INSERT INTO history (url, title, content, created, updated)
|
||||||
VALUES (?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?)
|
||||||
''', (url, title, content, now, now))
|
''', (url, title, content, created_time, now))
|
||||||
self.conn.commit()
|
self.conn.commit()
|
||||||
return self.cursor.lastrowid
|
return self.cursor.lastrowid
|
||||||
|
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ class HistoryCrawler(BaseCrawler):
|
|||||||
self.logger.info(f"Skipping URL from history: {url} ({skip_reason})")
|
self.logger.info(f"Skipping URL from history: {url} ({skip_reason})")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
success, result = await self.crawl_url(url, title)
|
success, result = await self.crawl_url(url, title, created_timestamp=timestamp)
|
||||||
if success:
|
if success:
|
||||||
self.logger.info(f"Processed historical URL: {url}")
|
self.logger.info(f"Processed historical URL: {url}")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user