# mail-agent/src/actions/unsubscribe.py

"""Unsubscribe action handler."""
import logging
import re
from typing import Optional
from urllib.parse import urlparse

import httpx

from ..models import Message

logger = logging.getLogger(__name__)


def find_unsubscribe_link(message: Message) -> Optional[str]:
    """Find an unsubscribe link in an email message.

    The HTML body is searched first, then the plain-text body. Within each
    body the patterns are tried in order and every match of a pattern is
    considered, so an invalid first match does not hide a valid later one.

    The List-Unsubscribe header is not checked yet (TODO: needs raw headers).

    Args:
        message: Message whose ``body_html`` and ``body_text`` are searched;
            either may be empty/None.

    Returns:
        The first candidate URL accepted by ``_is_valid_unsubscribe_url``,
        or ``None`` if no candidate is found.
    """
    # Local import: only this function needs it, and it keeps the block
    # self-contained with respect to the module's import section.
    from html import unescape

    # Ordered from most to least specific: href attributes first, then bare
    # URLs. All are matched case-insensitively.
    patterns = [
        r'href=["\']?(https?://[^"\'>\s]*unsubscribe[^"\'>\s]*)["\']?',
        r'href=["\']?(https?://[^"\'>\s]*optout[^"\'>\s]*)["\']?',
        r'href=["\']?(https?://[^"\'>\s]*opt-out[^"\'>\s]*)["\']?',
        r'href=["\']?(https?://[^"\'>\s]*remove[^"\'>\s]*)["\']?',
        r'(https?://[^\s<>"]*unsubscribe[^\s<>"]*)',
        r'(https?://[^\s<>"]*optout[^\s<>"]*)',
    ]

    # (body, is_html) pairs in search priority order.
    bodies = (
        (message.body_html, True),
        (message.body_text, False),
    )

    for body, is_html in bodies:
        if not body:
            continue
        for pattern in patterns:
            for candidate in re.findall(pattern, body, re.IGNORECASE):
                # URLs inside HTML markup are entity-escaped ("&amp;" for
                # "&"); decode them so the query string survives intact.
                url = unescape(candidate) if is_html else candidate
                if _is_valid_unsubscribe_url(url):
                    return url

    return None


def _is_valid_unsubscribe_url(url: str) -> bool:
    """Validate that a URL looks like a legitimate unsubscribe link."""
    try:
        parsed = urlparse(url)

        # Must be HTTP(S)
        if parsed.scheme not in ("http", "https"):
            return False

        # Must have a host
        if not parsed.netloc:
            return False

        # Reject obvious non-unsubscribe URLs
        suspicious = ["login", "password", "account", "download"]
        for term in suspicious:
            if term in url.lower() and "unsubscribe" not in url.lower():
                return False

        return True
    except Exception:
        return False


async def execute_unsubscribe(url: str) -> tuple[bool, str]:
    """Execute an unsubscribe action by visiting the URL.

    Issues a single GET request (30 second timeout, redirects followed) and
    treats any 2xx final response as success. The response body is scanned
    for common confirmation phrases only to choose a more specific message;
    a 2xx response without such a phrase is still reported as success.

    NOTE(review): the URL is fetched exactly as given, including any
    redirects it leads to. Callers presumably vet it first (for example via
    ``_is_valid_unsubscribe_url``) - confirm.

    Args:
        url: The unsubscribe URL to request.

    Returns:
        ``(success, message)``. On failure the message is ``"HTTP <status>"``,
        ``"Request timed out"``, or the text of the exception raised. This
        function does not raise for request errors.
    """
    # Phrases that indicate the page explicitly confirmed the unsubscribe.
    success_indicators = (
        "unsubscribed",
        "removed",
        "successfully",
        "you have been",
        "no longer",
    )

    try:
        async with httpx.AsyncClient(
            timeout=30.0,
            follow_redirects=True,
            headers={
                "User-Agent": "Mozilla/5.0 (compatible; MailAgent/1.0)",
            },
        ) as client:
            response = await client.get(url)

            # Any 2xx counts: endpoints may answer 204 No Content or
            # 202 Accepted instead of 200.
            if not 200 <= response.status_code < 300:
                logger.warning(
                    "Unsubscribe failed: %s for %s", response.status_code, url
                )
                return False, f"HTTP {response.status_code}"

            content = response.text.lower()
            if any(indicator in content for indicator in success_indicators):
                logger.info("Unsubscribe successful: %s", url)
                return True, "Successfully unsubscribed"

            # If we got 2xx but no clear success message, assume it worked
            # (many unsubscribe pages just say "done" or redirect)
            logger.info("Unsubscribe completed (no confirmation): %s", url)
            return True, "Unsubscribe request sent"

    except httpx.TimeoutException:
        logger.error("Unsubscribe timeout: %s", url)
        return False, "Request timed out"

    except Exception as e:
        # Boundary handler: callers rely on a (success, message) tuple rather
        # than an exception, so every failure is reported through the return.
        logger.error("Unsubscribe error: %s", e)
        # Some exceptions stringify to "", so fall back to the class name to
        # keep the failure message informative.
        return False, str(e) or type(e).__name__