141 lines
3.0 KiB
Go
141 lines
3.0 KiB
Go
package extract
|
|
|
|
import (
|
|
"strings"
|
|
)
|
|
|
|
// Size limits used by ChunkMarkdown. Every limit is compared against len() of
// a string, so despite the "Chars" suffix they are byte counts, not rune
// counts.
const (
	// maxChunkChars is the target upper bound for a chunk before the overlap
	// prefix is added.
	maxChunkChars = 1600

	// overlapChars is how much of the end of the previous chunk is repeated
	// at the start of the next one.
	overlapChars = 80

	// minChunkChars is the smallest section or chunk that is kept; anything
	// shorter is dropped.
	minChunkChars = 50
)
|
|
|
|
// ChunkMarkdown splits markdown text into overlapping chunks for embedding.
|
|
func ChunkMarkdown(text string) []string {
|
|
if strings.TrimSpace(text) == "" {
|
|
return nil
|
|
}
|
|
|
|
// Split on headings first
|
|
sections := splitOnHeadings(text)
|
|
|
|
var chunks []string
|
|
for _, section := range sections {
|
|
section = strings.TrimSpace(section)
|
|
if len(section) < minChunkChars {
|
|
continue
|
|
}
|
|
|
|
if len(section) <= maxChunkChars {
|
|
chunks = append(chunks, section)
|
|
continue
|
|
}
|
|
|
|
// Split further at paragraph breaks
|
|
paragraphs := strings.Split(section, "\n\n")
|
|
var current strings.Builder
|
|
for _, para := range paragraphs {
|
|
para = strings.TrimSpace(para)
|
|
if para == "" {
|
|
continue
|
|
}
|
|
|
|
if current.Len()+len(para)+2 > maxChunkChars && current.Len() > 0 {
|
|
chunks = append(chunks, current.String())
|
|
current.Reset()
|
|
}
|
|
|
|
if len(para) > maxChunkChars {
|
|
// Flush current buffer first
|
|
if current.Len() > 0 {
|
|
chunks = append(chunks, current.String())
|
|
current.Reset()
|
|
}
|
|
// Split at sentence boundaries
|
|
sentences := splitSentences(para)
|
|
for _, sent := range sentences {
|
|
if current.Len()+len(sent)+1 > maxChunkChars && current.Len() > 0 {
|
|
chunks = append(chunks, current.String())
|
|
current.Reset()
|
|
}
|
|
if current.Len() > 0 {
|
|
current.WriteString(" ")
|
|
}
|
|
current.WriteString(sent)
|
|
}
|
|
} else {
|
|
if current.Len() > 0 {
|
|
current.WriteString("\n\n")
|
|
}
|
|
current.WriteString(para)
|
|
}
|
|
}
|
|
if current.Len() > 0 {
|
|
chunks = append(chunks, current.String())
|
|
}
|
|
}
|
|
|
|
// Apply overlap
|
|
if len(chunks) > 1 {
|
|
overlapped := make([]string, len(chunks))
|
|
overlapped[0] = chunks[0]
|
|
for i := 1; i < len(chunks); i++ {
|
|
prev := chunks[i-1]
|
|
overlap := prev
|
|
if len(overlap) > overlapChars {
|
|
overlap = overlap[len(overlap)-overlapChars:]
|
|
}
|
|
overlapped[i] = overlap + " " + chunks[i]
|
|
}
|
|
chunks = overlapped
|
|
}
|
|
|
|
// Filter out too-short chunks
|
|
var result []string
|
|
for _, c := range chunks {
|
|
if len(strings.TrimSpace(c)) >= minChunkChars {
|
|
result = append(result, c)
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// splitOnHeadings cuts text into sections, starting a new section at each
// level-2 or level-3 markdown heading.
//
// A line counts as a heading when, ignoring surrounding whitespace, it begins
// with "## " or "### "; other heading levels do not start a section. Anything
// before the first heading forms a section of its own. Lines keep their
// original content and each one is terminated with "\n" in the output, so the
// result always contains at least one section.
func splitOnHeadings(text string) []string {
	rows := strings.Split(text, "\n")

	isHeading := func(row string) bool {
		row = strings.TrimSpace(row)
		return strings.HasPrefix(row, "## ") || strings.HasPrefix(row, "### ")
	}

	var out []string
	begin := 0 // index of the first row of the section being collected
	for idx, row := range rows {
		// A heading on the very first row has nothing before it to close.
		if idx == begin || !isHeading(row) {
			continue
		}
		out = append(out, strings.Join(rows[begin:idx], "\n")+"\n")
		begin = idx
	}

	// rows is never empty, so there is always a final section to emit.
	return append(out, strings.Join(rows[begin:], "\n")+"\n")
}
|
|
|
|
// splitSentences breaks text into sentences at every ". " separator.
//
// Each piece is trimmed of surrounding whitespace and pieces that end up empty
// are dropped. A piece that was followed by a separator gets its period put
// back; the text after the last separator is returned trimmed but otherwise
// unchanged. The result is nil when no non-empty piece exists.
func splitSentences(text string) []string {
	const sep = ". "

	var out []string
	rest := text
	for {
		cut := strings.Index(rest, sep)
		if cut < 0 {
			break
		}
		if piece := strings.TrimSpace(rest[:cut]); piece != "" {
			out = append(out, piece+".")
		}
		rest = rest[cut+len(sep):]
	}

	// Whatever follows the final separator keeps its own ending.
	if tail := strings.TrimSpace(rest); tail != "" {
		out = append(out, tail)
	}
	return out
}
|