mirror of
https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 10:29:38 +00:00
Fix exclusion
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import yaml
|
||||
from fnmatch import fnmatch
|
||||
from urllib.parse import urlparse
|
||||
|
||||
class DomainExclusions:
|
||||
def __init__(self, config_path="config/history_config.yaml"):
|
||||
@@ -14,42 +15,43 @@ class DomainExclusions:
|
||||
|
||||
# Handle both direct list and dict with 'excluded_domains' key
|
||||
if isinstance(config, list):
|
||||
self.excluded_domains = config
|
||||
loaded_patterns = config
|
||||
elif isinstance(config, dict):
|
||||
loaded_patterns = config.get('excluded_domains', [])
|
||||
else:
|
||||
self.excluded_domains = config.get('excluded_domains', [])
|
||||
loaded_patterns = [] # Handle other invalid config types
|
||||
|
||||
# Basic validation/cleaning of patterns
|
||||
self.excluded_domains = [
|
||||
str(p).strip() for p in loaded_patterns if p and isinstance(p, str)
|
||||
]
|
||||
# Optional: Warn if some patterns were ignored
|
||||
# if len(self.excluded_domains) != len(loaded_patterns):
|
||||
# print(f"Warning: Some invalid patterns were ignored in {config_path}")
|
||||
|
||||
except FileNotFoundError:
|
||||
print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
|
||||
self.excluded_domains = [] # Ensure it's empty on error
|
||||
except yaml.YAMLError as e:
|
||||
print(f"Error parsing YAML configuration: {e}")
|
||||
self.excluded_domains = []
|
||||
except Exception as e: # Catch other potential errors
|
||||
print(f"An unexpected error occurred during config loading: {e}")
|
||||
self.excluded_domains = []
|
||||
|
||||
def is_excluded(self, domain):
|
||||
"""
|
||||
Check if a domain matches any of the excluded domain patterns.
|
||||
"""
|
||||
# Strip protocol (http:// or https://) if present
|
||||
domain = domain.lower().strip('/')
|
||||
if '://' in domain:
|
||||
domain = domain.split('://', 1)[1]
|
||||
def is_excluded(self, url_string):
|
||||
if not url_string or not isinstance(url_string, str):
|
||||
return True
|
||||
|
||||
# Strip query parameters if present
|
||||
if '?' in domain:
|
||||
domain = domain.split('?', 1)[0]
|
||||
input_url = url_string.strip()
|
||||
|
||||
# Split domain and path
|
||||
if '/' in domain:
|
||||
domain = domain.split('/', 1)[0]
|
||||
# If the url starts with www, remove it
|
||||
if input_url.startswith('www.'):
|
||||
input_url = input_url[4:]
|
||||
|
||||
for pattern in self.excluded_domains:
|
||||
pattern = pattern.lower().strip('/')
|
||||
if '/' in pattern:
|
||||
pattern = pattern.split('/', 1)[0]
|
||||
|
||||
# Remove trailing wildcard if present
|
||||
if pattern.endswith('*'):
|
||||
pattern = pattern.rstrip('*').rstrip('.')
|
||||
|
||||
# Use fnmatch for proper wildcard pattern matching
|
||||
if fnmatch(domain, pattern):
|
||||
if pattern in input_url:
|
||||
return True
|
||||
|
||||
# If no patterns matched
|
||||
return False
|
||||
Reference in New Issue
Block a user