Refactor all the things

This commit is contained in:
2025-04-12 14:41:04 -05:00
parent 90dc188dc8
commit 5629a92acf
5 changed files with 210 additions and 75 deletions

View File

@@ -2,7 +2,7 @@ import yaml
from fnmatch import fnmatch
class DomainExclusions:
def __init__(self, config_path="config/history_config.yaml"):
    """
    Create the exclusion list and populate it from a YAML config file.

    Args:
        config_path (str): Path of the YAML configuration file to load.
    """
    # Start with an empty list so the attribute exists even if load_config
    # leaves it untouched (its missing-file branch only prints a warning).
    self.excluded_domains = []
    self.load_config(config_path)
@@ -12,8 +12,11 @@ class DomainExclusions:
with open(config_path, 'r') as f:
config = yaml.safe_load(f)
# Get the excluded_domains list from config, defaulting to empty list if not found
self.excluded_domains = config.get('excluded_domains', [])
# Handle both direct list and dict with 'excluded_domains' key
if isinstance(config, list):
self.excluded_domains = config
else:
self.excluded_domains = config.get('excluded_domains', [])
except FileNotFoundError:
print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
except yaml.YAMLError as e:
@@ -23,12 +26,30 @@ class DomainExclusions:
def is_excluded(self, domain):
    """
    Check if a domain matches any of the excluded domain patterns.

    The input and every pattern are lowercased and reduced to their host
    part before they are compared: surrounding slashes, an optional scheme
    (``http://``, ``https://``, ...) and any path are dropped. A query
    string is dropped from the input only, because ``?`` is a wildcard
    inside a pattern.
    Supports wildcards (*, ?) in the excluded domain patterns.

    Args:
        domain (str): The domain (or full URL) to check

    Returns:
        bool: True if the domain should be excluded, False otherwise
    """
    # Reduce the input to a bare host: scheme, query string and path go.
    host = domain.lower().strip('/')
    if '://' in host:
        host = host.split('://', 1)[1]
    # split() returns the whole string when the separator is absent.
    host = host.split('?', 1)[0].split('/', 1)[0]

    for pattern in self.excluded_domains:
        candidate = pattern.lower().strip('/')
        # Patterns may be written as URLs too; without this the path split
        # below would reduce "https://host/..." to the useless "https:".
        if '://' in candidate:
            candidate = candidate.split('://', 1)[1]
        candidate = candidate.split('/', 1)[0]

        # Keep the wildcards: fnmatch needs them for "example.*" or "*".
        if fnmatch(host, candidate):
            return True

        # A trailing wildcard may also stand for "nothing at all", so
        # "example.com.*" still matches the bare host "example.com".
        if candidate.endswith('*'):
            bare = candidate.rstrip('*').rstrip('.')
            if bare and fnmatch(host, bare):
                return True
    return False