mirror of
https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 02:19:37 +00:00
More domain exclusion
This commit is contained in:
@@ -6,7 +6,7 @@ excluded_domains:
|
||||
- "namecheap.com"
|
||||
- "us.ovhcloud.com"
|
||||
- "linode.com"
|
||||
- "github.com/settings/"
|
||||
- "github.com/settings"
|
||||
- "secure.backblaze.com"
|
||||
- "login."
|
||||
- "identity."
|
||||
|
||||
@@ -8,4 +8,4 @@ source (dirname (status filename))/.venv/bin/activate.fish
|
||||
python src/main.py > /dev/null 2>&1 &
|
||||
|
||||
# Print a simple confirmation message
|
||||
echo "Browser Recall started in background"
|
||||
echo "Browser Recall started in background with PID $!"
|
||||
@@ -24,11 +24,9 @@ class BaseCrawler:
|
||||
if url.startswith("about:") or url.startswith("chrome:"):
|
||||
return True, "Browser internal URL"
|
||||
|
||||
domain = urlparse(url).netloc
|
||||
|
||||
# Check domain exclusions
|
||||
if self.domain_exclusions.is_excluded(domain):
|
||||
return True, "Excluded domain"
|
||||
# Check domain exclusions using the full URL, not just the domain
|
||||
if self.domain_exclusions.is_excluded(url):
|
||||
return True, "Excluded domain/path"
|
||||
|
||||
# Check if URL exists
|
||||
if self.db.url_exists(url):
|
||||
|
||||
@@ -41,17 +41,74 @@ class DomainExclusions:
|
||||
|
||||
def is_excluded(self, url_string):
    """Return True when ``url_string`` matches an exclusion pattern.

    The check fails closed: anything that is not a non-empty string, has
    no network location (for example ``file://`` URLs), or cannot be
    parsed is reported as excluded.

    Each entry of ``self.excluded_domains`` is interpreted by its shape:

    * ``host/path`` (contains ``/``) -- the URL's host must equal the
      pattern host and the URL path must start with the pattern path.
      A pattern that is only ``host/`` matches the root page alone.
    * ``host`` -- matches that host and any of its subdomains.
    * a pattern starting or ending with ``.`` (``"login."``) -- also
      matches as a substring anywhere inside the host.

    Hosts are compared case-insensitively, with and without a leading
    ``www.``, and both with and without the port/userinfo part of the
    network location, so ``https://WWW.GitHub.com:443/settings`` is
    caught by ``github.com/settings``.

    Args:
        url_string: The full URL to test.

    Returns:
        True if the URL should be skipped, False otherwise.
    """
    if not url_string or not isinstance(url_string, str):
        return True  # Exclude invalid URLs

    try:
        parsed_url = urlparse(url_string.strip())
        netloc = parsed_url.netloc.lower()

        # No network location (file:// URLs, bare paths, ...): exclude.
        if not netloc:
            return True

        # Candidate spellings of the host. ``netloc`` may still carry
        # userinfo and a port; ``hostname`` has both removed. Each is
        # also tried without a leading "www." so that path patterns
        # written for the bare domain keep matching.
        hosts = []
        for host in (netloc, parsed_url.hostname or ''):
            bare = host[4:] if host.startswith('www.') else host
            for variant in (host, bare):
                if variant and variant not in hosts:
                    hosts.append(variant)

        # Represent the root explicitly so "host/" patterns can match it.
        path = parsed_url.path or '/'
        if not path.startswith('/'):
            path = '/' + path
        trimmed_path = path.rstrip('/')

        for pattern in self.excluded_domains:
            # 1. Path-specific patterns such as "github.com/settings".
            if '/' in pattern:
                pattern_host, _, pattern_path = pattern.rstrip('/').partition('/')
                # Comparing the host part on its own (instead of
                # prefix-matching host+path as one string) stops a
                # pattern like "exam/" from matching "example.com/...".
                if pattern_host.lower() not in hosts:
                    continue
                if not pattern_path:
                    # "host/" targets the root page only.
                    if path == '/':
                        return True
                elif trimmed_path.startswith('/' + pattern_path):
                    return True
                continue  # Path patterns never fall through to host rules

            needle = pattern.lower()
            # The leading dot keeps "example.com" from matching
            # "ample.com" in the suffix test below.
            dotted = needle if needle.startswith('.') else '.' + needle
            # Patterns like "login." or ".auth." may match anywhere in
            # the host.
            match_anywhere = needle.startswith('.') or needle.endswith('.')

            for host in hosts:
                # 2. Exact host or any subdomain of the pattern.
                if host == needle or ('.' + host).endswith(dotted):
                    return True
                # 3. Substring match for dot-anchored patterns.
                if match_anywhere and needle in host:
                    return True

    except ValueError:
        # Handle potential errors from urlparse on malformed URLs
        print(f"Warning: Could not parse URL '{url_string}' for exclusion check.")
        return True  # Exclude unparseable URLs
    except Exception as e:
        # Deliberately broad: any failure while checking (for example a
        # non-string pattern in the configuration) excludes the URL
        # rather than letting it through.
        print(f"Warning: Error processing URL '{url_string}' for exclusion: {e}")
        return True  # Exclude URLs that cause errors during processing

    # If no patterns matched
    return False
|
||||
Reference in New Issue
Block a user