// dealspace/lib/scrape.go
package lib
import (
"encoding/json"
"fmt"
"html"
"io"
"log"
"net/http"
"strings"
"time"
)
// decodeHTMLEntities converts HTML entity references in s (e.g. &amp;amp;,
// &amp;#8217;) back to their literal characters.
func decodeHTMLEntities(s string) string {
	decoded := html.UnescapeString(s)
	return decoded
}
// ScrapedOrg is the structured result from scraping an organization's website.
// The JSON tags mirror the object the extraction prompt asks the LLM to
// return; omitempty fields disappear from serialized output when empty.
type ScrapedOrg struct {
	Name        string          `json:"name"`
	Domain      string          `json:"domain"`
	Logo        string          `json:"logo,omitempty"` // URL to company logo
	Description string          `json:"description,omitempty"`
	Industry    string          `json:"industry,omitempty"`
	Website     string          `json:"website"`
	Phone       string          `json:"phone,omitempty"`
	Fax         string          `json:"fax,omitempty"`
	Address     string          `json:"address,omitempty"` // street address only; city/state/country are separate
	City        string          `json:"city,omitempty"`
	State       string          `json:"state,omitempty"`
	Country     string          `json:"country,omitempty"`
	Founded     string          `json:"founded,omitempty"` // founding year, as text
	LinkedIn    string          `json:"linkedin,omitempty"`
	People      []ScrapedPerson `json:"people,omitempty"` // people found on the site
}
// ScrapedPerson is a person found on the organization's website.
// Only Name is always present; every other field is best-effort extraction
// and may be empty.
type ScrapedPerson struct {
	Name     string `json:"name"`
	Email    string `json:"email,omitempty"` // may be inferred from a domain-wide pattern
	Title    string `json:"title,omitempty"` // job title / role
	Phone    string `json:"phone,omitempty"`
	Photo    string `json:"photo,omitempty"` // URL to headshot
	Bio      string `json:"bio,omitempty"`   // 1-2 sentence professional background
	LinkedIn string `json:"linkedin,omitempty"`
}
const scrapeModel = "google/gemini-2.0-flash-001"
// ScrapeOrgByEmail takes an email address, extracts the domain,
// fetches the website, and uses an LLM to extract org + people data.
//
// It returns an error for malformed addresses, including those with an
// empty local part ("@example.com") or an empty domain ("user@") — the
// latter previously slipped through and produced a confusing
// "could not fetch https://" error downstream.
func ScrapeOrgByEmail(apiKey, email string) (*ScrapedOrg, error) {
	parts := strings.SplitN(email, "@", 2)
	if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
		return nil, fmt.Errorf("invalid email: %s", email)
	}
	return ScrapeOrg(apiKey, parts[1])
}
// ScrapeOrg fetches a domain's website and extracts structured org + people data.
// Two-pass approach:
// 1. Fetch homepage → ask LLM which pages have team/about/contact info
// 2. Fetch those pages → ask LLM to extract structured data
//
// Returns an error if the homepage cannot be fetched, the extraction LLM
// call fails, or its response is not valid JSON. Link discovery is
// best-effort: on failure, extraction proceeds with the homepage alone.
func ScrapeOrg(apiKey, domain string) (*ScrapedOrg, error) {
	// Pass 1: fetch homepage
	base := "https://" + domain
	homepage := fetchPage(base)
	if homepage == "" {
		return nil, fmt.Errorf("could not fetch %s", base)
	}
	// Ask LLM to find relevant pages using sanitized prompt
	discoverInstructions := fmt.Sprintf(`You are analyzing the HTML of %s to find pages that contain:
1. Team / leadership / people / staff pages (with bios, headshots, names)
2. About / company info pages
3. Contact / office address pages
Look at the navigation, footer, and links in the HTML. Return a JSON array of up to 10 absolute URLs that are most likely to contain team members and company info. Only include URLs on the same domain (%s). Do not include the homepage itself.
Return ONLY a JSON array of strings, no markdown:
["https://%s/about", "https://%s/team", ...]
If you cannot find any relevant links, return an empty array: []`, domain, domain, domain, domain)
	discoverPrompt := BuildSafeScrapePrompt(discoverInstructions, homepage, domain, 50000)
	discoverMessages := []map[string]interface{}{
		{"role": "user", "content": discoverPrompt},
	}
	linksRaw, err := CallOpenRouter(apiKey, scrapeModel, discoverMessages, 1024)
	if err != nil {
		// Discovery is best-effort; fall back to homepage-only extraction.
		log.Printf("scrape discover error for %s: %v", domain, err)
		linksRaw = "[]"
	}
	var links []string
	// Models sometimes wrap JSON in a markdown code fence despite the
	// "no markdown" instruction; strip it before parsing.
	if err := json.Unmarshal([]byte(stripCodeFence(linksRaw)), &links); err != nil {
		log.Printf("scrape discover parse error for %s: %v (raw: %.200s)", domain, err, linksRaw)
		links = nil
	}
	// Fetch discovered pages in parallel, concatenated after the homepage.
	var allHTML strings.Builder
	allHTML.WriteString(fmt.Sprintf("\n<!-- PAGE: %s -->\n", base))
	allHTML.WriteString(homepage)
	if len(links) > 0 {
		extra := fetchPages(links)
		allHTML.WriteString(extra)
	}
	// Named pageHTML (not "html") so the imported html package is not shadowed.
	pageHTML := allHTML.String()
	// Pass 2: extract structured data using sanitized prompt
	extractInstructions := fmt.Sprintf(`Extract structured data from this company website. Domain: %s
RULES:
- Extract EVERY person mentioned — do not skip anyone
- Every person should have a "title" (job title / role) if one exists. Look at section headings, CSS classes, surrounding text. Common patterns: "Co-Founder", "Partner", "Managing Director", "Principal", "Investment Professional", "Operating Partner", "Operations Manager", "Finance & Operations", "Analyst", "Associate". If a person is under a heading like "Investment Professionals", their title is "Investment Professional". If no title can be determined, leave the title field empty — NEVER use generic placeholders like "Team Member" or "Staff".
- Photo/logo URLs must be fully qualified (https://...)
- Logo: find the company logo image — look for img tags in the header, navbar, or footer with "logo" in the src/alt/class. Return the full absolute URL.
- Address: put ONLY the street address in "address" (e.g. "2151 Central Avenue"). Put city, state, country in their own fields. Do NOT combine them.
- If you can infer emails from a pattern (e.g. firstname@%s), include them
- Bio: 1-2 sentences about their professional background, not personal hobbies
- Return at most 25 people. Prioritize leadership, partners, principals, and senior staff over junior employees, interns, or support staff
Return a single JSON object:
{
"name": "Company Name",
"domain": "%s",
"logo": "https://full-url-to-logo.png",
"description": "1-2 sentence description",
"industry": "sector",
"website": "https://%s",
"phone": "",
"fax": "",
"address": "street address only",
"city": "",
"state": "",
"country": "",
"founded": "year",
"linkedin": "url",
"people": [
{
"name": "Full Name",
"email": "email@domain",
"title": "Job Title",
"phone": "direct phone",
"photo": "https://full-url-to-headshot.jpg",
"bio": "1-2 sentences",
"linkedin": "url"
}
]
}
Return ONLY valid JSON — no markdown, no explanation. All text values must be clean plain text — decode any HTML entities (e.g. &#8217; → ', &amp; → &).`, domain, domain, domain, domain)
	prompt := BuildSafeScrapePrompt(extractInstructions, pageHTML, domain, 50000)
	messages := []map[string]interface{}{
		{"role": "user", "content": prompt},
	}
	raw, err := CallOpenRouter(apiKey, scrapeModel, messages, 8192)
	if err != nil {
		return nil, fmt.Errorf("llm scrape: %w", err)
	}
	var result ScrapedOrg
	if err := json.Unmarshal([]byte(stripCodeFence(raw)), &result); err != nil {
		return nil, fmt.Errorf("parse llm response: %w (raw: %.500s)", err, raw)
	}
	result.Domain = domain
	if result.Website == "" {
		result.Website = "https://" + domain
	}
	// Clean HTML entities from free-text fields (the prompt asks the LLM to
	// decode them, but models do not always comply). No-op on clean text.
	result.Name = decodeHTMLEntities(result.Name)
	result.Description = decodeHTMLEntities(result.Description)
	result.Industry = decodeHTMLEntities(result.Industry)
	result.Address = decodeHTMLEntities(result.Address)
	for i := range result.People {
		result.People[i].Name = decodeHTMLEntities(result.People[i].Name)
		result.People[i].Title = decodeHTMLEntities(result.People[i].Title)
		result.People[i].Bio = decodeHTMLEntities(result.People[i].Bio)
	}
	return &result, nil
}

// stripCodeFence removes a surrounding markdown code fence (```json ... ```)
// that LLMs sometimes emit despite instructions to return bare JSON.
// Input without a leading fence is returned unchanged apart from trimming.
func stripCodeFence(s string) string {
	s = strings.TrimSpace(s)
	if strings.HasPrefix(s, "```") {
		s = strings.TrimPrefix(s, "```json")
		s = strings.TrimPrefix(s, "```")
		if i := strings.LastIndex(s, "```"); i >= 0 {
			s = s[:i]
		}
		s = strings.TrimSpace(s)
	}
	return s
}
// fetchPage retrieves a single URL and returns its HTML body.
// Any failure — bad URL, transport error, or non-200 status — yields "".
func fetchPage(url string) string {
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return ""
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Dealspace/1.0)")
	resp, err := client.Do(req)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return ""
	}
	// Cap the read at 500 KiB to bound memory on oversized pages.
	data, _ := io.ReadAll(io.LimitReader(resp.Body, 500*1024))
	return string(data)
}
// fetchPages fetches multiple URLs concurrently and concatenates their raw
// HTML in the original URL order, each prefixed with a "<!-- PAGE: url -->"
// marker. Pages that error or return a non-200 status are skipped.
func fetchPages(urls []string) string {
	type fetched struct {
		idx  int
		url  string
		body string
	}
	client := &http.Client{Timeout: 10 * time.Second}
	// download returns the page body, or "" on any error / non-200 status.
	download := func(pageURL string) string {
		req, err := http.NewRequest("GET", pageURL, nil)
		if err != nil {
			return ""
		}
		req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Dealspace/1.0)")
		resp, err := client.Do(req)
		if err != nil {
			return ""
		}
		defer resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			return ""
		}
		data, _ := io.ReadAll(io.LimitReader(resp.Body, 500*1024))
		return string(data)
	}
	out := make(chan fetched, len(urls))
	for i, u := range urls {
		go func(idx int, pageURL string) {
			out <- fetched{idx: idx, url: pageURL, body: download(pageURL)}
		}(i, u)
	}
	// Collect every result, restoring the caller's URL order by index.
	ordered := make([]fetched, len(urls))
	for range urls {
		f := <-out
		ordered[f.idx] = f
	}
	var sb strings.Builder
	for _, f := range ordered {
		if f.body == "" {
			continue
		}
		sb.WriteString(fmt.Sprintf("\n<!-- PAGE: %s -->\n", f.url))
		sb.WriteString(f.body)
	}
	return sb.String()
}