// dealspace/lib/scrape.go
package lib
import (
"encoding/json"
"fmt"
"html"
"io"
"log"
"net/http"
"strings"
"time"
)
// decodeHTMLEntities converts HTML entity references in s (e.g. &amp;amp;,
// &amp;#8217;) back to their literal characters.
func decodeHTMLEntities(s string) string {
	decoded := html.UnescapeString(s)
	return decoded
}
// ScrapedOrg is the structured result from scraping an organization's website.
// The JSON tags mirror the object the extraction prompt asks the LLM to
// return; omitempty fields disappear from serialized output when empty.
type ScrapedOrg struct {
	Name        string          `json:"name"`
	Domain      string          `json:"domain"`
	Logo        string          `json:"logo,omitempty"` // URL to company logo
	Description string          `json:"description,omitempty"`
	Industry    string          `json:"industry,omitempty"`
	Website     string          `json:"website"`
	Phone       string          `json:"phone,omitempty"`
	Fax         string          `json:"fax,omitempty"`
	Address     string          `json:"address,omitempty"` // street address only; city/state/country are separate
	City        string          `json:"city,omitempty"`
	State       string          `json:"state,omitempty"`
	Country     string          `json:"country,omitempty"`
	Founded     string          `json:"founded,omitempty"` // founding year, as text
	LinkedIn    string          `json:"linkedin,omitempty"`
	People      []ScrapedPerson `json:"people,omitempty"` // people found on the site
}
// ScrapedPerson is a person found on the organization's website.
// Only Name is always present; every other field is best-effort extraction
// and may be empty.
type ScrapedPerson struct {
	Name     string `json:"name"`
	Email    string `json:"email,omitempty"` // may be inferred from a domain-wide pattern
	Title    string `json:"title,omitempty"` // job title / role
	Phone    string `json:"phone,omitempty"`
	Photo    string `json:"photo,omitempty"` // URL to headshot
	Bio      string `json:"bio,omitempty"`   // 1-2 sentence professional background
	LinkedIn string `json:"linkedin,omitempty"`
}
const scrapeModel = "google/gemini-2.0-flash-001"
// ScrapeOrgByEmail takes an email address, extracts the domain,
// fetches the website, and uses an LLM to extract org + people data.
//
// It returns an error for malformed addresses, including those with an
// empty local part ("@example.com") or an empty domain ("user@") — the
// latter previously slipped through and produced a confusing
// "could not fetch https://" error downstream.
func ScrapeOrgByEmail(apiKey, email string) (*ScrapedOrg, error) {
	parts := strings.SplitN(email, "@", 2)
	if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
		return nil, fmt.Errorf("invalid email: %s", email)
	}
	return ScrapeOrg(apiKey, parts[1])
}
// ScrapeOrg fetches a domain's website and extracts structured org + people data.
// Two-pass approach:
// 1. Fetch homepage → ask LLM which pages have team/about/contact info
// 2. Fetch those pages → ask LLM to extract structured data
//
// Returns an error if the homepage cannot be fetched, the extraction LLM
// call fails, or its response is not valid JSON. Link discovery is
// best-effort: on failure, extraction proceeds with the homepage alone.
func ScrapeOrg(apiKey, domain string) (*ScrapedOrg, error) {
	// Pass 1: fetch homepage
	base := "https://" + domain
	homepage := fetchPage(base)
	if homepage == "" {
		return nil, fmt.Errorf("could not fetch %s", base)
	}
	// Ask LLM to find relevant pages using sanitized prompt
	discoverInstructions := fmt.Sprintf(`You are analyzing the HTML of %s to find pages that contain:
1. Team / leadership / people / staff pages (with bios, headshots, names)
2. About / company info pages
3. Contact / office address pages
Look at the navigation, footer, and links in the HTML. Return a JSON array of up to 10 absolute URLs that are most likely to contain team members and company info. Only include URLs on the same domain (%s). Do not include the homepage itself.
Return ONLY a JSON array of strings, no markdown:
["https://%s/about", "https://%s/team", ...]
If you cannot find any relevant links, return an empty array: []`, domain, domain, domain, domain)
	discoverPrompt := BuildSafeScrapePrompt(discoverInstructions, homepage, domain, 50000)
	discoverMessages := []map[string]interface{}{
		{"role": "user", "content": discoverPrompt},
	}
	linksRaw, err := CallOpenRouter(apiKey, scrapeModel, discoverMessages, 1024)
	if err != nil {
		// Discovery is best-effort; fall back to homepage-only extraction.
		log.Printf("scrape discover error for %s: %v", domain, err)
		linksRaw = "[]"
	}
	var links []string
	// Models sometimes wrap JSON in a markdown code fence despite the
	// "no markdown" instruction; strip it before parsing.
	if err := json.Unmarshal([]byte(stripCodeFence(linksRaw)), &links); err != nil {
		log.Printf("scrape discover parse error for %s: %v (raw: %.200s)", domain, err, linksRaw)
		links = nil
	}
	// Fetch discovered pages in parallel, concatenated after the homepage.
	var allHTML strings.Builder
	allHTML.WriteString(fmt.Sprintf("\n<!-- PAGE: %s -->\n", base))
	allHTML.WriteString(homepage)
	if len(links) > 0 {
		extra := fetchPages(links)
		allHTML.WriteString(extra)
	}
	// Named pageHTML (not "html") so the imported html package is not shadowed.
	pageHTML := allHTML.String()
	// Pass 2: extract structured data using sanitized prompt
	extractInstructions := fmt.Sprintf(`Extract structured data from this company website. Domain: %s
RULES:
- Extract EVERY person mentioned — do not skip anyone
- Every person should have a "title" (job title / role) if one exists. Look at section headings, CSS classes, surrounding text. Common patterns: "Co-Founder", "Partner", "Managing Director", "Principal", "Investment Professional", "Operating Partner", "Operations Manager", "Finance & Operations", "Analyst", "Associate". If a person is under a heading like "Investment Professionals", their title is "Investment Professional". If no title can be determined, leave the title field empty — NEVER use generic placeholders like "Team Member" or "Staff".
- Photo/logo URLs must be fully qualified (https://...)
- Logo: find the company logo image — look for img tags in the header, navbar, or footer with "logo" in the src/alt/class. Return the full absolute URL.
- Address: put ONLY the street address in "address" (e.g. "2151 Central Avenue"). Put city, state, country in their own fields. Do NOT combine them.
- If you can infer emails from a pattern (e.g. firstname@%s), include them
- Bio: 1-2 sentences about their professional background, not personal hobbies
- Return at most 25 people. Prioritize leadership, partners, principals, and senior staff over junior employees, interns, or support staff
Return a single JSON object:
{
"name": "Company Name",
"domain": "%s",
"logo": "https://full-url-to-logo.png",
"description": "1-2 sentence description",
"industry": "sector",
"website": "https://%s",
"phone": "",
"fax": "",
"address": "street address only",
"city": "",
"state": "",
"country": "",
"founded": "year",
"linkedin": "url",
"people": [
{
"name": "Full Name",
"email": "email@domain",
"title": "Job Title",
"phone": "direct phone",
"photo": "https://full-url-to-headshot.jpg",
"bio": "1-2 sentences",
"linkedin": "url"
}
]
}
Return ONLY valid JSON — no markdown, no explanation. All text values must be clean plain text — decode any HTML entities (e.g. &#8217; → ', &amp; → &).`, domain, domain, domain, domain)
	prompt := BuildSafeScrapePrompt(extractInstructions, pageHTML, domain, 50000)
	messages := []map[string]interface{}{
		{"role": "user", "content": prompt},
	}
	raw, err := CallOpenRouter(apiKey, scrapeModel, messages, 8192)
	if err != nil {
		return nil, fmt.Errorf("llm scrape: %w", err)
	}
	var result ScrapedOrg
	if err := json.Unmarshal([]byte(stripCodeFence(raw)), &result); err != nil {
		return nil, fmt.Errorf("parse llm response: %w (raw: %.500s)", err, raw)
	}
	result.Domain = domain
	if result.Website == "" {
		result.Website = "https://" + domain
	}
	// Clean HTML entities from free-text fields (the prompt asks the LLM to
	// decode them, but models do not always comply). No-op on clean text.
	result.Name = decodeHTMLEntities(result.Name)
	result.Description = decodeHTMLEntities(result.Description)
	result.Industry = decodeHTMLEntities(result.Industry)
	result.Address = decodeHTMLEntities(result.Address)
	for i := range result.People {
		result.People[i].Name = decodeHTMLEntities(result.People[i].Name)
		result.People[i].Title = decodeHTMLEntities(result.People[i].Title)
		result.People[i].Bio = decodeHTMLEntities(result.People[i].Bio)
	}
	return &result, nil
}

// stripCodeFence removes a surrounding markdown code fence (```json ... ```)
// that LLMs sometimes emit despite instructions to return bare JSON.
// Input without a leading fence is returned unchanged apart from trimming.
func stripCodeFence(s string) string {
	s = strings.TrimSpace(s)
	if strings.HasPrefix(s, "```") {
		s = strings.TrimPrefix(s, "```json")
		s = strings.TrimPrefix(s, "```")
		if i := strings.LastIndex(s, "```"); i >= 0 {
			s = s[:i]
		}
		s = strings.TrimSpace(s)
	}
	return s
}
// fetchPage retrieves a single URL and returns its HTML body.
// Any failure — bad URL, transport error, or non-200 status — yields "".
func fetchPage(url string) string {
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return ""
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Dealspace/1.0)")
	resp, err := client.Do(req)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return ""
	}
	// Cap the read at 500 KiB to bound memory on oversized pages.
	data, _ := io.ReadAll(io.LimitReader(resp.Body, 500*1024))
	return string(data)
}
// fetchPages fetches multiple URLs concurrently and concatenates their raw
// HTML in the original URL order, each prefixed with a "<!-- PAGE: url -->"
// marker. Pages that error or return a non-200 status are skipped.
func fetchPages(urls []string) string {
	type fetched struct {
		idx  int
		url  string
		body string
	}
	client := &http.Client{Timeout: 10 * time.Second}
	// download returns the page body, or "" on any error / non-200 status.
	download := func(pageURL string) string {
		req, err := http.NewRequest("GET", pageURL, nil)
		if err != nil {
			return ""
		}
		req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Dealspace/1.0)")
		resp, err := client.Do(req)
		if err != nil {
			return ""
		}
		defer resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			return ""
		}
		data, _ := io.ReadAll(io.LimitReader(resp.Body, 500*1024))
		return string(data)
	}
	out := make(chan fetched, len(urls))
	for i, u := range urls {
		go func(idx int, pageURL string) {
			out <- fetched{idx: idx, url: pageURL, body: download(pageURL)}
		}(i, u)
	}
	// Collect every result, restoring the caller's URL order by index.
	ordered := make([]fetched, len(urls))
	for range urls {
		f := <-out
		ordered[f.idx] = f
	}
	var sb strings.Builder
	for _, f := range ordered {
		if f.body == "" {
			continue
		}
		sb.WriteString(fmt.Sprintf("\n<!-- PAGE: %s -->\n", f.url))
		sb.WriteString(f.body)
	}
	return sb.String()
}