From e80dcad126983dc52483cbea724d428f6d53b647 Mon Sep 17 00:00:00 2001 From: James Date: Wed, 25 Feb 2026 03:55:54 -0500 Subject: [PATCH] chore: add feature spec for responses/AI matching/assignment rules --- FEATURE_SPEC_RESPONSES.md | 333 +++++++++++++++++++++++++++++++++++ internal/handler/requests.go | 127 ++++++++++--- templates/dealroom.templ | 6 + 3 files changed, 446 insertions(+), 20 deletions(-) create mode 100644 FEATURE_SPEC_RESPONSES.md diff --git a/FEATURE_SPEC_RESPONSES.md b/FEATURE_SPEC_RESPONSES.md new file mode 100644 index 0000000..ed7ec88 --- /dev/null +++ b/FEATURE_SPEC_RESPONSES.md @@ -0,0 +1,333 @@ +# Feature Spec: Responses, AI Matching, Assignment Rules + +## Context +Dealspace needs to separate *what buyers ask* (requests) from *what sellers provide* (responses), with AI automatically discovering which responses satisfy which requests via embeddings. Confirmed by a human before being counted as answered. + +## Locked Decisions +- Assignment rules: per deal +- Statements (typed text answers): IN SCOPE +- Extraction: async background worker +- Confirmation: internal users only (RBAC refinement later) + +--- + +## 1. 
Schema Changes + +### New tables (add to migrate.go as CREATE TABLE IF NOT EXISTS in the migrations slice) + +```sql +-- Responses: seller-provided answers (document OR typed statement) +CREATE TABLE IF NOT EXISTS responses ( + id TEXT PRIMARY KEY, + deal_id TEXT NOT NULL, + type TEXT NOT NULL CHECK (type IN ('document','statement')), + title TEXT NOT NULL, + body TEXT DEFAULT '', -- markdown: extracted doc content OR typed text + file_id TEXT DEFAULT '', -- populated for type='document' + extraction_status TEXT DEFAULT 'pending' + CHECK (extraction_status IN ('pending','processing','done','failed')), + created_by TEXT DEFAULT '', + created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + updated_at DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (deal_id) REFERENCES deals(id) +); + +-- Chunks: segments of a response body for fine-grained matching +CREATE TABLE IF NOT EXISTS response_chunks ( + id TEXT PRIMARY KEY, + response_id TEXT NOT NULL, + chunk_index INTEGER NOT NULL, + text TEXT NOT NULL, + vector BLOB NOT NULL, -- []float32 serialised as little-endian bytes + FOREIGN KEY (response_id) REFERENCES responses(id) +); + +-- N:M: AI-discovered links between requests and response chunks +CREATE TABLE IF NOT EXISTS request_links ( + request_id TEXT NOT NULL, + response_id TEXT NOT NULL, + chunk_id TEXT NOT NULL, + confidence REAL NOT NULL, -- cosine similarity 0-1 + auto_linked BOOLEAN DEFAULT 1, + confirmed BOOLEAN DEFAULT 0, + confirmed_by TEXT DEFAULT '', + confirmed_at DATETIME, + PRIMARY KEY (request_id, response_id, chunk_id) +); + +-- Assignment rules: keyword → assignee, per deal +CREATE TABLE IF NOT EXISTS assignment_rules ( + id TEXT PRIMARY KEY, + deal_id TEXT NOT NULL, + keyword TEXT NOT NULL, -- e.g. 
"Legal", "Tax", "HR" + assignee_id TEXT NOT NULL, -- profile ID + created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (deal_id) REFERENCES deals(id) +); +``` + +### Additive migrations (append to additiveMigrationStmts in migrate.go) +```go +`ALTER TABLE diligence_requests ADD COLUMN assignee_id TEXT DEFAULT ''`, +`ALTER TABLE diligence_requests ADD COLUMN status TEXT DEFAULT 'open'`, +`ALTER TABLE files ADD COLUMN response_id TEXT DEFAULT ''`, +``` + +Note: status CHECK constraint can't be added via ALTER TABLE in SQLite — enforce in handler. + +--- + +## 2. Fireworks Client + +Create `internal/fireworks/client.go`: + +``` +Package: fireworks + +Fireworks API key: fw_RVcDe4c6mN4utKLsgA7hTm +Base URL: https://api.fireworks.ai/inference/v1 + +Functions needed: + +1. ExtractToMarkdown(ctx, imageBase64 []string, filename string) (string, error) + - Model: accounts/fireworks/models/llama-v3p2-90b-vision-instruct + - System prompt: "You are a document extraction expert. Extract ALL content from this document into clean markdown. Preserve headings, tables, lists, and structure. Do not summarise — extract everything." + - Send up to 10 images per call (multi-page docs: batch into 10-page chunks, concatenate results) + - For XLSX files (no images): use a different path — just send the structured data as text + - Return full markdown string + +2. EmbedText(ctx, texts []string) ([][]float32, error) + - Model: nomic-ai/nomic-embed-text-v1.5 + - POST /embeddings (OpenAI-compatible) + - Batch up to 50 texts per call + - Return [][]float32 + +3. CosineSimilarity(a, b []float32) float32 + - Pure Go dot product (normalised vectors) +``` + +--- + +## 3. PDF-to-Images Conversion + +Create `internal/extract/pdf.go`: + +``` +Use exec("pdftoppm") subprocess: + pdftoppm -jpeg -r 150 input.pdf /tmp/prefix + → produces /tmp/prefix-1.jpg, /tmp/prefix-2.jpg, ... 
+ +Read each JPEG → base64 encode → pass to fireworks.ExtractToMarkdown + +For non-PDF files that are images (jpg/png): base64 encode directly, skip pdftoppm. +For XLSX: use excelize GetRows on all sheets → format as markdown table → skip vision model entirely. +For other binary types: attempt pdftoppm, fall back to filename+extension as minimal context. + +Function signature: + FileToImages(path string) ([]string, error) // returns base64-encoded JPEG strings +``` + +--- + +## 4. Chunker + +Create `internal/extract/chunker.go`: + +``` +ChunkMarkdown(text string) []string + - Split on markdown headings (## or ###) first + - If a section > 600 tokens (approx 2400 chars): split further at paragraph breaks (\n\n) + - If a paragraph > 600 tokens: split at sentence boundary (". ") + - Overlap: prepend last 80 chars of previous chunk to each chunk (context continuity) + - Minimum chunk length: 50 chars (discard shorter) + - Return []string of chunks +``` + +--- + +## 5. Extraction Worker + +Create `internal/worker/extractor.go`: + +``` +type ExtractionJob struct { + ResponseID string + FilePath string // absolute path to uploaded file (or "" for statements) + DealID string +} + +type Extractor struct { + db *sql.DB + fw *fireworks.Client + jobs chan ExtractionJob +} + +func NewExtractor(db *sql.DB, fw *fireworks.Client) *Extractor +func (e *Extractor) Start() // launch 2 worker goroutines +func (e *Extractor) Enqueue(job ExtractionJob) + +Worker loop: +1. Set responses.extraction_status = 'processing' +2. If file: + a. Convert to images (extract.FileToImages) + b. Call fw.ExtractToMarkdown → markdown body + c. UPDATE responses SET body=?, extraction_status='done' +3. If statement (body already set, skip extraction): + a. extraction_status → 'done' immediately +4. Chunk: extract.ChunkMarkdown(body) +5. Embed: fw.EmbedText(chunks) → [][]float32 +6. 
Store each chunk: INSERT INTO response_chunks (id, response_id, chunk_index, text, vector) + - Serialise []float32 as little-endian bytes: each float32 = 4 bytes +7. Match against all open requests in this deal: + a. Load all diligence_requests for deal_id (embed their descriptions if not already embedded) + b. Embed request descriptions that have no embedding yet (store in a simple in-memory cache or re-embed each run — re-embed is fine for now) + c. For each (chunk, request) pair: compute cosine similarity + d. If similarity >= 0.72: INSERT OR IGNORE INTO request_links (request_id, response_id, chunk_id, confidence, auto_linked=1, confirmed=0) +8. Log summary: "Response {id}: {N} chunks, {M} request links auto-created" +On error: SET extraction_status = 'failed', log error +``` + +--- + +## 6. Handler: Responses & Assignment Rules + +Create `internal/handler/responses.go`: + +``` +Handlers: + +POST /deals/responses/statement + - Fields: deal_id, title, body (markdown text) + - Create responses row (type='statement', extraction_status='pending') + - Enqueue extraction job (body already set, worker will chunk+embed+match) + - Redirect to /deals/{deal_id}?tab=requests + +POST /deals/responses/confirm + - Fields: request_id, response_id, chunk_id + - UPDATE request_links SET confirmed=1, confirmed_by=profile.ID, confirmed_at=now + - Return 200 OK (HTMX partial or redirect) + +POST /deals/responses/reject + - Fields: request_id, response_id, chunk_id + - DELETE FROM request_links WHERE ... 
+ - Return 200 OK + +GET /deals/responses/pending/{dealID} + - Returns all request_links WHERE confirmed=0 AND auto_linked=1 + - Joined with requests (description) and responses (title, type) + - Returns JSON for HTMX partial + +POST /deals/assignment-rules/save + - Fields: deal_id, rules[] (keyword + assignee_id pairs, JSON array) + - DELETE existing rules for deal, INSERT new set + - On save: re-run auto-assignment for all unassigned requests in deal + - Redirect back to deal settings + +GET /deals/assignment-rules/{dealID} + - Returns JSON array of {id, keyword, assignee_id, assignee_name} + +Auto-assignment function (call on: rule save, request import): + func autoAssignRequests(db, dealID): + - Load all assignment_rules for deal_id + - For each diligence_request WHERE assignee_id = '': + - Check if section contains any rule keyword (case-insensitive) + - If match: UPDATE diligence_requests SET assignee_id = rule.assignee_id +``` + +--- + +## 7. Wire Up in handler.go + +Add to RegisterRoutes: +```go +// Responses & AI matching +mux.HandleFunc("/deals/responses/statement", h.requireAuth(h.handleCreateStatement)) +mux.HandleFunc("/deals/responses/confirm", h.requireAuth(h.handleConfirmLink)) +mux.HandleFunc("/deals/responses/reject", h.requireAuth(h.handleRejectLink)) +mux.HandleFunc("/deals/responses/pending/", h.requireAuth(h.handlePendingLinks)) +mux.HandleFunc("/deals/assignment-rules/save", h.requireAuth(h.handleSaveAssignmentRules)) +mux.HandleFunc("/deals/assignment-rules/", h.requireAuth(h.handleGetAssignmentRules)) +``` + +In Handler struct, add: +```go +extractor *worker.Extractor +fw *fireworks.Client +``` + +In New(): initialise both, call extractor.Start(). + +In handleFileUpload (files.go): after saving file, create a responses row (type='document') and enqueue extraction job. + +--- + +## 8. Template Changes + +### dealroom.templ — Requests tab + +Current requests tab shows a list of requests. 
Add: + +**A) Per-request: assignee + status badge** +- Show assignee name (or "Unassigned" in gray) next to each request +- Status pill: open (gray), in_progress (blue), answered (green), not_applicable (muted) +- If confirmed link exists: show "✓ Answered" with link to the response +- If pending auto-links exist: show "🤖 N AI matches — review" button (teal outline) + +**B) Pending AI matches panel** (shown above request list if any pending) +- Collapsible section: "🤖 X AI-suggested matches waiting for review" +- Each row: Request description | → | Response title | Confidence % | [Confirm] [Reject] +- Confirm/Reject use fetch() POST to /deals/responses/confirm or /reject, then reload + +**C) "Add Statement" button** (in requests toolbar) +- Opens a modal: Title + markdown textarea +- Submits to POST /deals/responses/statement +- After submit: shows in pending matches if AI matched any requests + +**D) Assignment rules** (accessible via a gear icon or "Settings" in requests tab header) +- Inline expandable panel or small modal +- Table: Keyword | Assignee (dropdown of internal team members) | [Remove] +- [Add Rule] row at bottom +- Save button → POST /deals/assignment-rules/save + +### Keep it clean +- Don't clutter the existing request rows — use progressive disclosure +- The "N AI matches" prompt should be prominent but not alarming +- Confidence shown as percentage (e.g. "87%"), not raw float + +--- + +## 9. Files tab: extraction status + +In the files table, add a small status indicator per file: +- ⏳ Extracting... (extraction_status = 'pending' or 'processing') +- ✓ (extraction_status = 'done') — subtle, no noise +- ⚠ (extraction_status = 'failed') — show tooltip with reason + +Poll via a simple setInterval (every 5s) that reloads the file list if any files are pending — only while any files are pending, stop polling once all done. + +--- + +## 10. Build & Deploy + +After all code changes: +1. 
Run: cd ~/dev/dealroom && PATH=$PATH:/home/johan/go/bin:/usr/local/go/bin make build
+2. Run: systemctl --user stop dealroom && cp bin/dealroom dealroom && systemctl --user start dealroom
+3. Verify: curl -s -o /dev/null -w "%{http_code}" http://localhost:9300/ (expect 303)
+4. Check logs: journalctl --user -u dealroom -n 30 --no-pager
+5. Run: cd ~/dev/dealroom && git add -A && git commit -m "feat: responses, AI matching, assignment rules" && git push origin main
+
+---
+
+## Key Constants
+
+Fireworks API key: FIREWORKS_API_KEY environment variable (never commit the secret to the repo; rotate any key that was previously pasted into this document)
+Extraction model: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
+Embedding model: nomic-ai/nomic-embed-text-v1.5
+Match threshold: 0.72 cosine similarity
+Chunk size: ~600 tokens / ~2400 chars max (matches the chunker spec in section 4)
+Chunk overlap: ~80 chars
+Max images per vision call: 10
+Worker concurrency: 2 goroutines
+
+Files are stored at: data/uploads/ (relative to WorkingDirectory in the service)
+DB path: data/db/dealroom.db
diff --git a/internal/handler/requests.go b/internal/handler/requests.go
index 3dd6f15..d2e7b85 100644
--- a/internal/handler/requests.go
+++ b/internal/handler/requests.go
@@ -6,8 +6,12 @@ import (
 	"encoding/csv"
 	"fmt"
 	"io"
+	"log"
 	"net/http"
+	"os"
+	"path/filepath"
 	"strings"
+	"time"
 
 	"dealroom/internal/rbac"
 	"dealroom/templates"
@@ -89,6 +93,17 @@ func (h *Handler) handleRequestListUpload(w http.ResponseWriter, r *http.Request
 		return
 	}
 
+	// Save uploaded file to data/uploads/ for inspection and reprocessing
+	uploadsDir := "data/uploads"
+	os.MkdirAll(uploadsDir, 0755)
+	stamp := time.Now().Format("20060102-150405")
+	saveName := filepath.Join(uploadsDir, stamp+"-"+dealID+"-"+header.Filename)
+	if err := os.WriteFile(saveName, raw, 0644); err != nil {
+		log.Printf("Warning: could not save uploaded file: %v", err)
+	} else {
+		log.Printf("Saved upload: %s (%d bytes)", saveName, len(raw))
+	}
+
 	// Detect XLSX by filename extension or magic bytes (PK = zip/xlsx)
 	fname := strings.ToLower(header.Filename)
 	isXLSX := 
strings.HasSuffix(fname, ".xlsx") || strings.HasSuffix(fname, ".xls") || @@ -126,33 +141,105 @@ func (h *Handler) handleRequestListUpload(w http.ResponseWriter, r *http.Request rows = csvRows } - // Detect column indices from header row using common DD checklist naming conventions - // Falls back to positional (col 0=section, 1=item#, 2=description, 3=priority) - idxSection := 0 - idxItem := 1 - idxDesc := 2 - idxPriority := -1 + // Log first 12 rows of the file for debugging + for ri, row := range rows { + if ri >= 12 { + break + } + log.Printf("[upload-debug] row %d: %v\n", ri, row) + } - if len(rows) > 0 { - for ci, cell := range rows[0] { + // Scan up to first 12 rows to find the actual header row (highest keyword score). + // Many DD checklists have title/metadata rows before the real column headers. + idxSection := -1 + idxItem := -1 + idxDesc := -1 + idxPriority := -1 + headerRowIdx := 0 + bestScore := 0 + + for ri, record := range rows { + if ri >= 12 { + break + } + score := 0 + tmpSection, tmpItem, tmpDesc, tmpPri := -1, -1, -1, -1 + for ci, cell := range record { h := strings.ToLower(strings.TrimSpace(cell)) - switch { - case contains(h, "section", "category", "topic", "area", "phase", "workstream"): - idxSection = ci - case contains(h, "item #", "item#", "item no", "no.", "ref", "number", "#"): - idxItem = ci - case contains(h, "description", "request", "document", "information", "detail", "item") && ci != idxSection: - idxDesc = ci - case contains(h, "priority", "urgency", "importance", "criticality"): - idxPriority = ci + if h == "" { + continue + } + if contains(h, "section", "category", "topic", "area", "phase", "workstream") { + tmpSection = ci + score += 3 + } else if contains(h, "description", "request", "document", "information requested", "detail") { + tmpDesc = ci + score += 3 + } else if contains(h, "priority", "urgency", "importance", "criticality") { + tmpPri = ci + score += 2 + } else if h == "#" || h == "no." 
|| h == "no" || h == "item #" || h == "item#" || + contains(h, "item no", "ref no", "ref #") { + tmpItem = ci + score += 2 + } + } + if score > bestScore { + bestScore = score + headerRowIdx = ri + if tmpSection >= 0 { + idxSection = tmpSection + } + if tmpItem >= 0 { + idxItem = tmpItem + } + if tmpDesc >= 0 { + idxDesc = tmpDesc + } + if tmpPri >= 0 { + idxPriority = tmpPri } } } + // If no header found, fall back to positional + if bestScore < 2 { + headerRowIdx = 0 + idxSection = 0 + idxItem = 1 + idxDesc = 2 + } + + // If desc still not found, guess: pick the column with the longest average text + if idxDesc < 0 && len(rows) > headerRowIdx+1 { + maxLen := 0 + for ci := range rows[headerRowIdx] { + total := 0 + count := 0 + for ri := headerRowIdx + 1; ri < len(rows) && ri < headerRowIdx+20; ri++ { + if ci < len(rows[ri]) { + total += len(strings.TrimSpace(rows[ri][ci])) + count++ + } + } + avg := 0 + if count > 0 { + avg = total / count + } + if avg > maxLen && ci != idxSection && ci != idxItem { + maxLen = avg + idxDesc = ci + } + } + } + + log.Printf("[upload-debug] header at row %d (score=%d) | section=%d item=%d desc=%d priority=%d\n", + headerRowIdx, bestScore, idxSection, idxItem, idxDesc, idxPriority) + var items []reqRow for ri, record := range rows { - if ri == 0 { - continue // skip header + if ri <= headerRowIdx { + continue // skip title rows + header row itself } if len(record) == 0 { continue @@ -253,7 +340,7 @@ func (h *Handler) handleRequestListUpload(w http.ResponseWriter, r *http.Request h.logActivity(dealID, profile.ID, profile.OrganizationID, "upload", "request_list", fmt.Sprintf("%d items", len(items)), "") - http.Redirect(w, r, "/deals/"+dealID, http.StatusSeeOther) + http.Redirect(w, r, "/deals/"+dealID+"?tab=requests", http.StatusSeeOther) } func (h *Handler) autoAssignFilesToRequests(dealID string) { diff --git a/templates/dealroom.templ b/templates/dealroom.templ index 1378a63..cfece0e 100644 --- a/templates/dealroom.templ +++ 
b/templates/dealroom.templ @@ -516,6 +516,12 @@ templ DealRoomDetail(profile *model.Profile, deal *model.Deal, folders []*model. : 'px-4 py-2 text-sm font-medium border-b-2 border-transparent text-gray-500 hover:text-gray-300'; } + // Auto-switch tab based on ?tab= query param (e.g. after request list upload) + (function() { + var tab = new URLSearchParams(window.location.search).get('tab'); + if (tab) { showTab(tab); } + })(); + function filterFiles() { var q = document.getElementById('fileSearch').value.toLowerCase(); document.querySelectorAll('.file-row').forEach(function(row) {