Fix exclusion

This commit is contained in:
2025-04-12 15:03:58 -05:00
parent f723a56090
commit 832dde2ae0
3 changed files with 40 additions and 39 deletions

View File

@@ -1,5 +1,6 @@
import yaml
from fnmatch import fnmatch
from urllib.parse import urlparse
class DomainExclusions:
def __init__(self, config_path="config/history_config.yaml"):
@@ -14,42 +15,43 @@ class DomainExclusions:
# Handle both direct list and dict with 'excluded_domains' key
if isinstance(config, list):
self.excluded_domains = config
loaded_patterns = config
elif isinstance(config, dict):
loaded_patterns = config.get('excluded_domains', [])
else:
self.excluded_domains = config.get('excluded_domains', [])
loaded_patterns = [] # Handle other invalid config types
# Basic validation/cleaning of patterns
self.excluded_domains = [
str(p).strip() for p in loaded_patterns if p and isinstance(p, str)
]
# Optional: Warn if some patterns were ignored
# if len(self.excluded_domains) != len(loaded_patterns):
# print(f"Warning: Some invalid patterns were ignored in {config_path}")
except FileNotFoundError:
print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
self.excluded_domains = [] # Ensure it's empty on error
except yaml.YAMLError as e:
print(f"Error parsing YAML configuration: {e}")
self.excluded_domains = []
except Exception as e: # Catch other potential errors
print(f"An unexpected error occurred during config loading: {e}")
self.excluded_domains = []
def is_excluded(self, domain):
"""
Check if a domain matches any of the excluded domain patterns.
"""
# Strip protocol (http:// or https://) if present
domain = domain.lower().strip('/')
if '://' in domain:
domain = domain.split('://', 1)[1]
def is_excluded(self, url_string):
if not url_string or not isinstance(url_string, str):
return True
# Strip query parameters if present
if '?' in domain:
domain = domain.split('?', 1)[0]
input_url = url_string.strip()
# Split domain and path
if '/' in domain:
domain = domain.split('/', 1)[0]
# If the url starts with www, remove it
if input_url.startswith('www.'):
input_url = input_url[4:]
for pattern in self.excluded_domains:
pattern = pattern.lower().strip('/')
if '/' in pattern:
pattern = pattern.split('/', 1)[0]
# Remove trailing wildcard if present
if pattern.endswith('*'):
pattern = pattern.rstrip('*').rstrip('.')
# Use fnmatch for proper wildcard pattern matching
if fnmatch(domain, pattern):
if pattern in input_url:
return True
# If no patterns matched
return False