dealroom/internal/extract/chunker.go

141 lines
3.0 KiB
Go

package extract
import (
"strings"
)
// Size limits used by ChunkMarkdown. Despite the "Chars" suffix, every
// comparison against these values uses len() on a string, so they are
// byte counts rather than rune counts.
const (
	// minChunkChars is the smallest trimmed length a section or final
	// chunk may have; anything shorter is discarded.
	minChunkChars = 50
	// maxChunkChars is the size above which a section is broken up at
	// paragraph (and, if needed, sentence) boundaries.
	maxChunkChars = 1600
	// overlapChars is how much of the tail of the previous chunk is
	// prepended to each following chunk.
	overlapChars = 80
)
// ChunkMarkdown splits markdown text into overlapping chunks for embedding.
//
// The text is first cut into sections by splitOnHeadings. Each trimmed
// section is then handled by size (all sizes are byte lengths):
//
//   - shorter than minChunkChars: dropped;
//   - at most maxChunkChars: emitted as a single chunk;
//   - larger: packed paragraph by paragraph (split on blank lines) into
//     chunks of up to maxChunkChars, and any paragraph that is itself too
//     large is packed sentence by sentence via splitSentences.
//
// A single sentence longer than maxChunkChars is not split further, so a
// chunk can exceed the limit in that case.
//
// Every chunk after the first is then prefixed with up to overlapChars bytes
// taken from the end of the preceding (un-overlapped) chunk plus a single
// space. The overlap always starts on a UTF-8 rune boundary so multi-byte
// characters are never cut in half. Finally, chunks whose trimmed length is
// below minChunkChars are removed.
//
// It returns nil when text is blank or when no chunk survives filtering.
func ChunkMarkdown(text string) []string {
	if strings.TrimSpace(text) == "" {
		return nil
	}

	var chunks []string
	for _, section := range splitOnHeadings(text) {
		section = strings.TrimSpace(section)
		if len(section) < minChunkChars {
			continue
		}
		if len(section) <= maxChunkChars {
			chunks = append(chunks, section)
			continue
		}

		// The section is too large: pack paragraphs into a running buffer
		// and flush it whenever the next paragraph would not fit.
		var current strings.Builder
		for _, para := range strings.Split(section, "\n\n") {
			para = strings.TrimSpace(para)
			if para == "" {
				continue
			}
			// +2 accounts for the "\n\n" separator that would be inserted.
			if current.Len() > 0 && current.Len()+len(para)+2 > maxChunkChars {
				chunks = append(chunks, current.String())
				current.Reset()
			}

			if len(para) <= maxChunkChars {
				if current.Len() > 0 {
					current.WriteString("\n\n")
				}
				current.WriteString(para)
				continue
			}

			// Oversized paragraph: fall back to sentence granularity. The
			// buffer is guaranteed to be empty here, because a non-empty
			// buffer plus an oversized paragraph always trips the flush
			// above.
			for _, sent := range splitSentences(para) {
				// +1 accounts for the joining space.
				if current.Len() > 0 && current.Len()+len(sent)+1 > maxChunkChars {
					chunks = append(chunks, current.String())
					current.Reset()
				}
				if current.Len() > 0 {
					current.WriteString(" ")
				}
				current.WriteString(sent)
			}
		}
		if current.Len() > 0 {
			chunks = append(chunks, current.String())
		}
	}

	// Apply overlap: prefix each chunk with the tail of its predecessor.
	if len(chunks) > 1 {
		overlapped := make([]string, len(chunks))
		overlapped[0] = chunks[0]
		for i := 1; i < len(chunks); i++ {
			tail := chunks[i-1]
			if len(tail) > overlapChars {
				start := len(tail) - overlapChars
				// Bytes of the form 10xxxxxx are UTF-8 continuation bytes.
				// Skip forward past them so the overlap never begins in the
				// middle of a multi-byte character (which would yield
				// invalid UTF-8). For valid input this trims at most three
				// bytes from the overlap.
				for start < len(tail) && tail[start]&0xC0 == 0x80 {
					start++
				}
				tail = tail[start:]
			}
			overlapped[i] = tail + " " + chunks[i]
		}
		chunks = overlapped
	}

	// Filter out too-short chunks.
	var result []string
	for _, c := range chunks {
		if len(strings.TrimSpace(c)) >= minChunkChars {
			result = append(result, c)
		}
	}
	return result
}
// splitOnHeadings cuts text into sections, starting a new section at every
// line whose whitespace-trimmed form begins with "## " or "### ".
//
// Lines are kept verbatim and each one is re-terminated with "\n", so every
// returned section ends with a newline. A heading on the very first line
// does not produce an empty leading section. Other heading levels (such as
// "# " or "#### ") do not start a new section.
func splitOnHeadings(text string) []string {
	startsSection := func(line string) bool {
		s := strings.TrimSpace(line)
		return strings.HasPrefix(s, "## ") || strings.HasPrefix(s, "### ")
	}

	var (
		out []string
		buf strings.Builder
	)
	for _, line := range strings.Split(text, "\n") {
		// Only close the running section if it already has content.
		if buf.Len() > 0 && startsSection(line) {
			out = append(out, buf.String())
			buf.Reset()
		}
		buf.WriteString(line)
		buf.WriteByte('\n')
	}
	if buf.Len() == 0 {
		return out
	}
	return append(out, buf.String())
}
// splitSentences breaks text into sentences at every ". " separator.
//
// Each piece is whitespace-trimmed and blank pieces are discarded. The
// period consumed by the separator is re-attached to every piece except the
// final one, which is returned as-is (it keeps whatever terminator it
// already had). Other terminators such as "? " or "! " are not treated as
// boundaries.
func splitSentences(text string) []string {
	pieces := strings.Split(text, ". ")
	last := len(pieces) - 1

	var out []string
	for idx := 0; idx <= last; idx++ {
		sentence := strings.TrimSpace(pieces[idx])
		switch {
		case sentence == "":
			// Nothing left after trimming; drop it.
		case idx < last:
			out = append(out, sentence+".")
		default:
			out = append(out, sentence)
		}
	}
	return out
}