Historical records use their original timestamp as the `created` time

This commit is contained in:
2025-04-12 21:13:21 -05:00
parent 93063d2465
commit bfc7f4dd35
3 changed files with 9 additions and 6 deletions

View File

@@ -1,9 +1,10 @@
from typing import Tuple
from typing import Tuple, Optional
from urllib.parse import urlparse
from database import Database
from domain_exclusions import DomainExclusions
from logger import Logger
from crawl4ai import AsyncWebCrawler
from datetime import datetime
class BaseCrawler:
def __init__(self, db: Database, domain_exclusions: DomainExclusions, logger: Logger):
@@ -34,7 +35,7 @@ class BaseCrawler:
return False, ""
async def crawl_url(self, url: str, default_title: str = None) -> Tuple[bool, dict]:
async def crawl_url(self, url: str, default_title: str = None, created_timestamp: Optional[datetime] = None) -> Tuple[bool, dict]:
try:
result = await self.crawler.arun(url=url)
crawl_result = result[0]
@@ -44,7 +45,8 @@ class BaseCrawler:
self.db.add_history(
url=url,
title=title,
content=content
content=content,
created_timestamp=created_timestamp
)
return True, {

View File

@@ -59,14 +59,15 @@ class Database:
self.conn.commit()
def add_history(self, url: str, title: str, content: str) -> int:
def add_history(self, url: str, title: str, content: str, created_timestamp: Optional[datetime] = None) -> int:
"""Add a new history entry."""
now = datetime.utcnow()
created_time = created_timestamp if created_timestamp else now
with self._lock:
self.cursor.execute('''
INSERT INTO history (url, title, content, created, updated)
VALUES (?, ?, ?, ?, ?)
''', (url, title, content, now, now))
''', (url, title, content, created_time, now))
self.conn.commit()
return self.cursor.lastrowid

View File

@@ -63,7 +63,7 @@ class HistoryCrawler(BaseCrawler):
self.logger.info(f"Skipping URL from history: {url} ({skip_reason})")
continue
success, result = await self.crawl_url(url, title)
success, result = await self.crawl_url(url, title, created_timestamp=timestamp)
if success:
self.logger.info(f"Processed historical URL: {url}")