// Organization scraping: fetch a company's website and use an LLM to
// extract structured org + people data.
package lib
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"html"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// decodeHTMLEntities converts HTML entities in s (e.g. "&amp;", "&lt;")
// back into their literal characters using the standard library decoder.
func decodeHTMLEntities(s string) string {
	decoded := html.UnescapeString(s)
	return decoded
}
|
|
|
|
// ScrapedOrg is the structured result from scraping an organization's website.
// It is populated by unmarshaling the LLM's JSON response in ScrapeOrg, so the
// json tags below must match the JSON schema given in the extraction prompt.
type ScrapedOrg struct {
	Name        string `json:"name"`
	Domain      string `json:"domain"`            // always overwritten with the scraped domain after parsing
	Logo        string `json:"logo,omitempty"`    // URL to company logo
	Description string `json:"description,omitempty"`
	Industry    string `json:"industry,omitempty"`
	Website     string `json:"website"`           // defaulted to "https://"+Domain when the LLM leaves it empty
	Phone       string `json:"phone,omitempty"`
	Fax         string `json:"fax,omitempty"`
	Address     string `json:"address,omitempty"` // street address only; city/state/country are separate fields
	City        string `json:"city,omitempty"`
	State       string `json:"state,omitempty"`
	Country     string `json:"country,omitempty"`
	Founded     string `json:"founded,omitempty"` // year, as a string
	LinkedIn    string `json:"linkedin,omitempty"`

	// People found on the site; the extraction prompt caps this at 25.
	People []ScrapedPerson `json:"people,omitempty"`
}
|
|
|
|
// ScrapedPerson is a person found on the organization's website.
// Fields are filled from the LLM's JSON output; any of them may be empty
// when the information could not be determined from the page content.
type ScrapedPerson struct {
	Name     string `json:"name"`
	Email    string `json:"email,omitempty"` // may be inferred from a pattern like firstname@domain
	Title    string `json:"title,omitempty"` // job title; never a generic placeholder per the prompt rules
	Phone    string `json:"phone,omitempty"`
	Photo    string `json:"photo,omitempty"` // URL to headshot
	Bio      string `json:"bio,omitempty"`   // 1-2 sentences of professional background
	LinkedIn string `json:"linkedin,omitempty"`
}
|
|
|
|
const scrapeModel = "google/gemini-2.0-flash-001"
|
|
|
|
// ScrapeOrgByEmail takes an email address, extracts the domain,
|
|
// fetches the website, and uses an LLM to extract org + people data.
|
|
func ScrapeOrgByEmail(apiKey, email string) (*ScrapedOrg, error) {
|
|
parts := strings.SplitN(email, "@", 2)
|
|
if len(parts) != 2 {
|
|
return nil, fmt.Errorf("invalid email: %s", email)
|
|
}
|
|
domain := parts[1]
|
|
return ScrapeOrg(apiKey, domain)
|
|
}
|
|
|
|
// ScrapeOrg fetches a domain's website and extracts structured org + people data.
|
|
// Two-pass approach:
|
|
// 1. Fetch homepage → ask LLM which pages have team/about/contact info
|
|
// 2. Fetch those pages → ask LLM to extract structured data
|
|
func ScrapeOrg(apiKey, domain string) (*ScrapedOrg, error) {
|
|
// Pass 1: fetch homepage
|
|
base := "https://" + domain
|
|
homepage := fetchPage(base)
|
|
if homepage == "" {
|
|
return nil, fmt.Errorf("could not fetch %s", base)
|
|
}
|
|
|
|
// Ask LLM to find relevant pages using sanitized prompt
|
|
discoverInstructions := fmt.Sprintf(`You are analyzing the HTML of %s to find pages that contain:
|
|
1. Team / leadership / people / staff pages (with bios, headshots, names)
|
|
2. About / company info pages
|
|
3. Contact / office address pages
|
|
|
|
Look at the navigation, footer, and links in the HTML. Return a JSON array of up to 10 absolute URLs that are most likely to contain team members and company info. Only include URLs on the same domain (%s). Do not include the homepage itself.
|
|
|
|
Return ONLY a JSON array of strings, no markdown:
|
|
["https://%s/about", "https://%s/team", ...]
|
|
|
|
If you cannot find any relevant links, return an empty array: []`, domain, domain, domain, domain)
|
|
|
|
discoverPrompt := BuildSafeScrapePrompt(discoverInstructions, homepage, domain, 50000)
|
|
|
|
discoverMessages := []map[string]interface{}{
|
|
{"role": "user", "content": discoverPrompt},
|
|
}
|
|
|
|
linksRaw, err := CallOpenRouter(apiKey, scrapeModel, discoverMessages, 1024)
|
|
if err != nil {
|
|
log.Printf("scrape discover error for %s: %v", domain, err)
|
|
linksRaw = "[]"
|
|
}
|
|
|
|
var links []string
|
|
if err := json.Unmarshal([]byte(linksRaw), &links); err != nil {
|
|
log.Printf("scrape discover parse error for %s: %v (raw: %.200s)", domain, err, linksRaw)
|
|
links = nil
|
|
}
|
|
|
|
// Fetch discovered pages in parallel
|
|
var allHTML strings.Builder
|
|
allHTML.WriteString(fmt.Sprintf("\n<!-- PAGE: %s -->\n", base))
|
|
allHTML.WriteString(homepage)
|
|
|
|
if len(links) > 0 {
|
|
extra := fetchPages(links)
|
|
allHTML.WriteString(extra)
|
|
}
|
|
|
|
html := allHTML.String()
|
|
|
|
// Pass 2: extract structured data using sanitized prompt
|
|
extractInstructions := fmt.Sprintf(`Extract structured data from this company website. Domain: %s
|
|
|
|
RULES:
|
|
- Extract EVERY person mentioned — do not skip anyone
|
|
- Every person should have a "title" (job title / role) if one exists. Look at section headings, CSS classes, surrounding text. Common patterns: "Co-Founder", "Partner", "Managing Director", "Principal", "Investment Professional", "Operating Partner", "Operations Manager", "Finance & Operations", "Analyst", "Associate". If a person is under a heading like "Investment Professionals", their title is "Investment Professional". If no title can be determined, leave the title field empty — NEVER use generic placeholders like "Team Member" or "Staff".
|
|
- Photo/logo URLs must be fully qualified (https://...)
|
|
- Logo: find the company logo image — look for img tags in the header, navbar, or footer with "logo" in the src/alt/class. Return the full absolute URL.
|
|
- Address: put ONLY the street address in "address" (e.g. "2151 Central Avenue"). Put city, state, country in their own fields. Do NOT combine them.
|
|
- If you can infer emails from a pattern (e.g. firstname@%s), include them
|
|
- Bio: 1-2 sentences about their professional background, not personal hobbies
|
|
- Return at most 25 people. Prioritize leadership, partners, principals, and senior staff over junior employees, interns, or support staff
|
|
|
|
Return a single JSON object:
|
|
{
|
|
"name": "Company Name",
|
|
"domain": "%s",
|
|
"logo": "https://full-url-to-logo.png",
|
|
"description": "1-2 sentence description",
|
|
"industry": "sector",
|
|
"website": "https://%s",
|
|
"phone": "",
|
|
"fax": "",
|
|
"address": "street address only",
|
|
"city": "",
|
|
"state": "",
|
|
"country": "",
|
|
"founded": "year",
|
|
"linkedin": "url",
|
|
"people": [
|
|
{
|
|
"name": "Full Name",
|
|
"email": "email@domain",
|
|
"title": "Job Title",
|
|
"phone": "direct phone",
|
|
"photo": "https://full-url-to-headshot.jpg",
|
|
"bio": "1-2 sentences",
|
|
"linkedin": "url"
|
|
}
|
|
]
|
|
}
|
|
|
|
Return ONLY valid JSON — no markdown, no explanation. All text values must be clean plain text — decode any HTML entities (e.g. ’ → ', & → &).`, domain, domain, domain, domain)
|
|
|
|
prompt := BuildSafeScrapePrompt(extractInstructions, html, domain, 50000)
|
|
|
|
messages := []map[string]interface{}{
|
|
{"role": "user", "content": prompt},
|
|
}
|
|
|
|
raw, err := CallOpenRouter(apiKey, scrapeModel, messages, 8192)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("llm scrape: %w", err)
|
|
}
|
|
|
|
var result ScrapedOrg
|
|
if err := json.Unmarshal([]byte(raw), &result); err != nil {
|
|
return nil, fmt.Errorf("parse llm response: %w (raw: %.500s)", err, raw)
|
|
}
|
|
|
|
result.Domain = domain
|
|
if result.Website == "" {
|
|
result.Website = "https://" + domain
|
|
}
|
|
|
|
// Clean HTML entities from text fields
|
|
result.Name = decodeHTMLEntities(result.Name)
|
|
result.Description = decodeHTMLEntities(result.Description)
|
|
for i := range result.People {
|
|
result.People[i].Name = decodeHTMLEntities(result.People[i].Name)
|
|
result.People[i].Title = decodeHTMLEntities(result.People[i].Title)
|
|
result.People[i].Bio = decodeHTMLEntities(result.People[i].Bio)
|
|
}
|
|
|
|
return &result, nil
|
|
}
|
|
|
|
// fetchPage fetches a single URL and returns its HTML body, or "" on any
// error (bad URL, network failure, non-200 status). The body is capped at
// 500 KiB to bound memory use and downstream LLM prompt size.
func fetchPage(url string) string {
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return ""
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Dealspace/1.0)")
	resp, err := client.Do(req)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Drain (bounded) before closing so the transport can reuse the
		// underlying connection; the original code closed without draining.
		io.Copy(io.Discard, io.LimitReader(resp.Body, 64*1024))
		return ""
	}
	body, _ := io.ReadAll(io.LimitReader(resp.Body, 500*1024))
	return string(body)
}
|
|
|
|
// fetchPages fetches multiple URLs concurrently and concatenates their raw
// HTML in input order, each page preceded by a "<!-- PAGE: url -->" marker.
// Pages that return errors or non-200 status are skipped. Each body is
// capped at 500 KiB to bound memory use.
func fetchPages(urls []string) string {
	type result struct {
		idx  int
		url  string
		body string
	}

	// Buffered to len(urls) so every goroutine can send without blocking.
	ch := make(chan result, len(urls))
	// One shared client so connections can be reused across fetches.
	client := &http.Client{Timeout: 10 * time.Second}

	for i, u := range urls {
		// Loop values passed as arguments so each goroutine gets its own copy
		// (required pre-Go 1.22, harmless after).
		go func(idx int, url string) {
			req, err := http.NewRequest("GET", url, nil)
			if err != nil {
				ch <- result{idx, url, ""}
				return
			}
			req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Dealspace/1.0)")
			resp, err := client.Do(req)
			if err != nil {
				ch <- result{idx, url, ""}
				return
			}
			defer resp.Body.Close()
			if resp.StatusCode != http.StatusOK {
				// Drain (bounded) before closing so the transport can reuse
				// the connection; the original closed without draining.
				io.Copy(io.Discard, io.LimitReader(resp.Body, 64*1024))
				ch <- result{idx, url, ""}
				return
			}
			body, _ := io.ReadAll(io.LimitReader(resp.Body, 500*1024))
			ch <- result{idx, url, string(body)}
		}(i, u)
	}

	// Collect every result, restoring input order via the index.
	results := make([]result, len(urls))
	for range urls {
		r := <-ch
		results[r.idx] = r
	}

	var sb strings.Builder
	for _, r := range results {
		if r.body != "" {
			sb.WriteString(fmt.Sprintf("\n<!-- PAGE: %s -->\n", r.url))
			sb.WriteString(r.body)
		}
	}
	return sb.String()
}
|