mirror of
https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 02:19:37 +00:00
114 lines
5.6 KiB
Python
114 lines
5.6 KiB
Python
import yaml
|
|
from fnmatch import fnmatch
|
|
from urllib.parse import urlparse
|
|
|
|
class DomainExclusions:
    """Decide whether a URL should be skipped, based on configured patterns.

    Patterns are read from a YAML file (either a top-level list or a mapping
    with an ``excluded_domains`` key) and are interpreted by ``is_excluded``:

    * ``example.com`` -- that host and any subdomain of it.
    * ``example.com/settings`` -- host plus a path prefix. The comparison is
      a plain ``startswith``, so ``example.com/settingsfoo`` matches too.
    * ``example.com/`` -- the root page of that host (not its sub-pages).
    * ``login.`` / ``.auth.`` -- patterns that start or end with a dot are
      additionally matched as a substring anywhere in the host.

    The host part of a pattern is compared case-insensitively; the path part
    is compared as written.
    """

    def __init__(self, config_path="config/history_config.yaml"):
        """Create the matcher and immediately load patterns from *config_path*."""
        self.excluded_domains = []
        self.load_config(config_path)

    def load_config(self, config_path):
        """Load excluded domains from the YAML configuration file.

        Accepts a top-level list of patterns or a mapping whose
        ``excluded_domains`` key holds the list (a single string is treated
        as a one-element list). Non-string and blank entries are dropped.

        Errors are never raised to the caller: a message is printed and
        ``self.excluded_domains`` is left empty.
        """
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)

            # Handle both direct list and dict with 'excluded_domains' key
            if isinstance(config, list):
                loaded_patterns = config
            elif isinstance(config, dict):
                # ``excluded_domains:`` with no value parses as None.
                loaded_patterns = config.get('excluded_domains') or []
            else:
                loaded_patterns = []  # Handle other invalid config types

            # A scalar value would otherwise be iterated character by
            # character, yielding one-letter patterns such as "." that match
            # almost every host.
            if isinstance(loaded_patterns, str):
                loaded_patterns = [loaded_patterns]

            # Strip first, then drop blanks, so whitespace-only entries do
            # not survive as empty patterns.
            self.excluded_domains = [
                p.strip()
                for p in loaded_patterns
                if isinstance(p, str) and p.strip()
            ]

        except FileNotFoundError:
            print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
            self.excluded_domains = []  # Ensure it's empty on error
        except yaml.YAMLError as e:
            print(f"Error parsing YAML configuration: {e}")
            self.excluded_domains = []
        except Exception as e:  # Catch other potential errors
            print(f"An unexpected error occurred during config loading: {e}")
            self.excluded_domains = []

    @staticmethod
    def _pattern_matches(pattern, domain, path):
        """Return True if *pattern* matches the given host and URL path.

        *domain* is an already lower-cased host (optionally with ``:port``)
        and *path* is the URL path, always starting with ``/``.
        """
        if '/' in pattern:
            # Path-specific pattern: lower-case only the host part, because
            # URL paths are case-sensitive.
            pattern_host, _, pattern_path = pattern.partition('/')
            normalized_pattern = (pattern_host.lower() + '/' + pattern_path).rstrip('/')

            # "example.com/" means the root page only.
            if normalized_pattern == domain:
                return path == '/'
            # Otherwise a prefix match on "host/path" (trailing '/' ignored).
            return (domain + path).rstrip('/').startswith(normalized_pattern)

        pattern = pattern.lower()

        # Exact host, or a subdomain of it. Both sides get a leading '.' so
        # that "ample.com" does not match "example.com".
        suffix = pattern if pattern.startswith('.') else '.' + pattern
        if domain == pattern or ('.' + domain).endswith(suffix):
            return True

        # Patterns like "login." or ".auth." are meant to match anywhere in
        # the host; this is intentionally a loose substring check.
        if pattern.startswith('.') or pattern.endswith('.'):
            return pattern in domain

        return False

    def is_excluded(self, url_string):
        """Return True if *url_string* should be excluded.

        Empty or non-string input, URLs without a host (e.g. ``file://``),
        and URLs that fail to parse are all excluded. Otherwise the URL is
        excluded when any configured pattern matches its host or host+path.
        """
        if not url_string or not isinstance(url_string, str):
            return True  # Exclude invalid URLs

        try:
            parsed_url = urlparse(url_string.strip())

            # ``netloc`` still carries userinfo, the port and the original
            # letter case ("User@LocalHost:8080"), which would stop a plain
            # "localhost" pattern from matching. ``hostname`` is the bare,
            # lower-cased host.
            host = parsed_url.hostname
            if not host:
                return True  # Exclude URLs without a domain

            # Also try "host:port" so patterns that include a port still work.
            host_with_port = parsed_url.netloc.rpartition('@')[2].lower()
            candidates = [host]
            if host_with_port != host:
                candidates.append(host_with_port)

            # Represent the root path explicitly and make sure the path
            # starts with '/' so it can be appended to the host.
            path = parsed_url.path or '/'
            if not path.startswith('/'):
                path = '/' + path

            for pattern in self.excluded_domains:
                if not pattern:
                    continue  # An empty pattern carries no meaning
                for domain in candidates:
                    if self._pattern_matches(pattern, domain, path):
                        return True

        except ValueError:
            # Handle potential errors from urlparse on malformed URLs
            print(f"Warning: Could not parse URL '{url_string}' for exclusion check.")
            return True  # Exclude unparseable URLs
        except Exception as e:
            # Log other errors during URL parsing or checking
            print(f"Warning: Error processing URL '{url_string}' for exclusion: {e}")
            return True  # Exclude URLs that cause errors during processing

        # If no patterns matched
        return False