package lib

import (
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"
	"time"
)

// ScrapedOrg is the structured result from scraping an organization's website.
type ScrapedOrg struct {
	Name        string          `json:"name"`
	Domain      string          `json:"domain"`
	Logo        string          `json:"logo,omitempty"` // URL to company logo
	Description string          `json:"description,omitempty"`
	Industry    string          `json:"industry,omitempty"`
	Website     string          `json:"website"`
	Phone       string          `json:"phone,omitempty"`
	Fax         string          `json:"fax,omitempty"`
	Address     string          `json:"address,omitempty"`
	City        string          `json:"city,omitempty"`
	State       string          `json:"state,omitempty"`
	Country     string          `json:"country,omitempty"`
	Founded     string          `json:"founded,omitempty"`
	LinkedIn    string          `json:"linkedin,omitempty"`
	People      []ScrapedPerson `json:"people,omitempty"`
}

// ScrapedPerson is a person found on the organization's website.
type ScrapedPerson struct {
	Name     string `json:"name"`
	Email    string `json:"email,omitempty"`
	Title    string `json:"title,omitempty"`
	Phone    string `json:"phone,omitempty"`
	Photo    string `json:"photo,omitempty"` // URL to headshot
	Bio      string `json:"bio,omitempty"`
	LinkedIn string `json:"linkedin,omitempty"`
}

// scrapeModel is the OpenRouter model used for both the discovery and
// extraction passes.
const scrapeModel = "google/gemini-2.0-flash-001"

// ScrapeOrgByEmail takes an email address, extracts the domain,
// fetches the website, and uses an LLM to extract org + people data.
func ScrapeOrgByEmail(apiKey, email string) (*ScrapedOrg, error) {
	parts := strings.SplitN(email, "@", 2)
	// Reject both "no @ at all" and the degenerate "user@" (empty domain),
	// which would otherwise flow through to a doomed fetch of "https://".
	if len(parts) != 2 || parts[1] == "" {
		return nil, fmt.Errorf("invalid email: %s", email)
	}
	// Domain names are case-insensitive; normalize so downstream URLs and
	// any caching keyed on the domain are stable.
	domain := strings.ToLower(parts[1])
	return ScrapeOrg(apiKey, domain)
}

// ScrapeOrg fetches a domain's website and extracts structured org + people data.
// Two-pass approach:
//  1. Fetch homepage → ask LLM which pages have team/about/contact info
//  2.
Fetch those pages → ask LLM to extract structured data func ScrapeOrg(apiKey, domain string) (*ScrapedOrg, error) { // Pass 1: fetch homepage base := "https://" + domain homepage := fetchPage(base) if homepage == "" { return nil, fmt.Errorf("could not fetch %s", base) } // Ask LLM to find relevant pages discoverPrompt := fmt.Sprintf(`You are analyzing the HTML of %s to find pages that contain: 1. Team / leadership / people / staff pages (with bios, headshots, names) 2. About / company info pages 3. Contact / office address pages Look at the navigation, footer, and links in the HTML. Return a JSON array of up to 10 absolute URLs that are most likely to contain team members and company info. Only include URLs on the same domain (%s). Do not include the homepage itself. Return ONLY a JSON array of strings, no markdown: ["https://%s/about", "https://%s/team", ...] If you cannot find any relevant links, return an empty array: [] HTML: %s`, domain, domain, domain, domain, homepage) discoverMessages := []map[string]interface{}{ {"role": "user", "content": discoverPrompt}, } linksRaw, err := CallOpenRouter(apiKey, scrapeModel, discoverMessages, 1024) if err != nil { log.Printf("scrape discover error for %s: %v", domain, err) linksRaw = "[]" } var links []string if err := json.Unmarshal([]byte(linksRaw), &links); err != nil { log.Printf("scrape discover parse error for %s: %v (raw: %.200s)", domain, err, linksRaw) links = nil } // Fetch discovered pages in parallel var allHTML strings.Builder allHTML.WriteString(fmt.Sprintf("\n\n", base)) allHTML.WriteString(homepage) if len(links) > 0 { extra := fetchPages(links) allHTML.WriteString(extra) } html := allHTML.String() // Pass 2: extract structured data prompt := fmt.Sprintf(`Extract structured data from this company website. Domain: %s RULES: - Extract EVERY person mentioned — do not skip anyone - Every person MUST have a "title" (job title / role). Look at section headings, CSS classes, surrounding text to determine titles. 
Common patterns: "Co-Founder", "Partner", "Managing Director", "Principal", "Investment Professional", "Operating Partner", "Operations Manager", "Finance & Operations", "Analyst", "Associate". If a person is under a heading like "Investment Professionals", their title is "Investment Professional". Never use generic "Team Member". - Photo/logo URLs must be fully qualified (https://...) - Logo: find the company logo image — look for img tags in the header, navbar, or footer with "logo" in the src/alt/class. Return the full absolute URL. - Address: put ONLY the street address in "address" (e.g. "2151 Central Avenue"). Put city, state, country in their own fields. Do NOT combine them. - If you can infer emails from a pattern (e.g. firstname@%s), include them - Bio: 1-2 sentences about their professional background, not personal hobbies - Return at most 25 people. Prioritize leadership, partners, principals, and senior staff over junior employees, interns, or support staff Return a single JSON object: { "name": "Company Name", "domain": "%s", "logo": "https://full-url-to-logo.png", "description": "1-2 sentence description", "industry": "sector", "website": "https://%s", "phone": "", "fax": "", "address": "street address only", "city": "", "state": "", "country": "", "founded": "year", "linkedin": "url", "people": [ { "name": "Full Name", "email": "email@domain", "title": "Job Title", "phone": "direct phone", "photo": "https://full-url-to-headshot.jpg", "bio": "1-2 sentences", "linkedin": "url" } ] } Return ONLY valid JSON — no markdown, no explanation. 
HTML: %s`, domain, domain, domain, domain, html) messages := []map[string]interface{}{ {"role": "user", "content": prompt}, } raw, err := CallOpenRouter(apiKey, scrapeModel, messages, 8192) if err != nil { return nil, fmt.Errorf("llm scrape: %w", err) } var result ScrapedOrg if err := json.Unmarshal([]byte(raw), &result); err != nil { return nil, fmt.Errorf("parse llm response: %w (raw: %.500s)", err, raw) } result.Domain = domain if result.Website == "" { result.Website = "https://" + domain } return &result, nil } // fetchPage fetches a single URL and returns its HTML body (or "" on error). func fetchPage(url string) string { client := &http.Client{Timeout: 10 * time.Second} req, err := http.NewRequest("GET", url, nil) if err != nil { return "" } req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Dealspace/1.0)") resp, err := client.Do(req) if err != nil || resp.StatusCode != 200 { if resp != nil { resp.Body.Close() } return "" } defer resp.Body.Close() body, _ := io.ReadAll(io.LimitReader(resp.Body, 500*1024)) return string(body) } // fetchPages fetches multiple URLs concurrently and concatenates their raw HTML. // Skips pages that return errors or non-200 status. 
func fetchPages(urls []string) string {
	// result carries one fetched page; idx preserves the input ordering so
	// the concatenated output is deterministic regardless of completion order.
	type result struct {
		idx  int
		url  string
		body string
	}
	// Buffer the channel to len(urls) so no goroutine can block on send,
	// even if the receiver were to bail early.
	ch := make(chan result, len(urls))
	client := &http.Client{Timeout: 10 * time.Second}
	for i, u := range urls {
		go func(idx int, url string) {
			req, err := http.NewRequest("GET", url, nil)
			if err != nil {
				ch <- result{idx, url, ""}
				return
			}
			req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Dealspace/1.0)")
			resp, err := client.Do(req)
			if err != nil || resp.StatusCode != 200 {
				// resp is nil when err != nil; only close a body that exists.
				if resp != nil {
					resp.Body.Close()
				}
				ch <- result{idx, url, ""}
				return
			}
			defer resp.Body.Close()
			// Cap each page at 500 KiB to bound memory and LLM context size.
			body, _ := io.ReadAll(io.LimitReader(resp.Body, 500*1024))
			ch <- result{idx, url, string(body)}
		}(i, u)
	}
	// Collect exactly len(urls) results and slot them by index so every
	// spawned goroutine is accounted for (no leaks) and order is stable.
	results := make([]result, len(urls))
	for range urls {
		r := <-ch
		results[r.idx] = r
	}
	var sb strings.Builder
	for _, r := range results {
		if r.body != "" {
			// Fix: the original fmt.Sprintf("\n\n", r.url) had no %s verb,
			// so it emitted "%!(EXTRA string=...)" instead of a page marker.
			sb.WriteString(fmt.Sprintf("\n\n<!-- PAGE: %s -->\n", r.url))
			sb.WriteString(r.body)
		}
	}
	return sb.String()
}