import re
from markdownify import markdownify as md
from bs4 import BeautifulSoup
from typing import Optional
from urllib.parse import urlparse
from .config import ReaderConfig
import logging
from .database import SessionLocal, BlacklistedDomain
# Keep log output terse: only WARNING and above, rendered as "LEVEL: message".
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)
# Patterns for cleaning.
# These are regex source strings (not compiled patterns). None of them embeds
# flags, so case sensitivity and whether `.` crosses newlines depend on the
# flags passed wherever they are used (not visible in this section).

# From an opening `<script` to the nearest following `/script>` (lazy match);
# spaces are tolerated after `<`, after `/`, and before the closing `>`.
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
# Same shape as SCRIPT_PATTERN, for `<style ...> ... /style>` spans.
STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
# A single tag whose name starts with `meta`, up to the first `>`.
# NOTE(review): there is no word boundary after `meta`, so e.g. `<metadata>`
# would match too — confirm that is acceptable.
META_PATTERN = r"<[ ]*meta.*?>"
# An HTML comment: `<!--` through the nearest `-->` (spaces allowed after `<`
# and before `>`).
COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
# A single tag whose name starts with `link`, up to the first `>`.
# NOTE(review): same missing word boundary as META_PATTERN.
LINK_PATTERN = r"<[ ]*link.*?>"
# An <img> tag whose src attribute is an inline base64 data URI, i.e.
# src="data:image/<subtype>;base64,<payload>". The whole tag is matched, from
# `<img` through its closing `>`, including any attributes before or after src.
# The literal must stay on a single line: a non-triple-quoted raw string
# cannot span lines, and the leading `<img[^>` is required for the character
# class `[^>]+` to be well-formed.
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
SVG_PATTERN = r"(