package extract import ( "strings" ) const ( maxChunkChars = 1600 overlapChars = 80 minChunkChars = 50 ) // ChunkMarkdown splits markdown text into overlapping chunks for embedding. func ChunkMarkdown(text string) []string { if strings.TrimSpace(text) == "" { return nil } // Split on headings first sections := splitOnHeadings(text) var chunks []string for _, section := range sections { section = strings.TrimSpace(section) if len(section) < minChunkChars { continue } if len(section) <= maxChunkChars { chunks = append(chunks, section) continue } // Split further at paragraph breaks paragraphs := strings.Split(section, "\n\n") var current strings.Builder for _, para := range paragraphs { para = strings.TrimSpace(para) if para == "" { continue } if current.Len()+len(para)+2 > maxChunkChars && current.Len() > 0 { chunks = append(chunks, current.String()) current.Reset() } if len(para) > maxChunkChars { // Flush current buffer first if current.Len() > 0 { chunks = append(chunks, current.String()) current.Reset() } // Split at sentence boundaries sentences := splitSentences(para) for _, sent := range sentences { if current.Len()+len(sent)+1 > maxChunkChars && current.Len() > 0 { chunks = append(chunks, current.String()) current.Reset() } if current.Len() > 0 { current.WriteString(" ") } current.WriteString(sent) } } else { if current.Len() > 0 { current.WriteString("\n\n") } current.WriteString(para) } } if current.Len() > 0 { chunks = append(chunks, current.String()) } } // Apply overlap if len(chunks) > 1 { overlapped := make([]string, len(chunks)) overlapped[0] = chunks[0] for i := 1; i < len(chunks); i++ { prev := chunks[i-1] overlap := prev if len(overlap) > overlapChars { overlap = overlap[len(overlap)-overlapChars:] } overlapped[i] = overlap + " " + chunks[i] } chunks = overlapped } // Filter out too-short chunks var result []string for _, c := range chunks { if len(strings.TrimSpace(c)) >= minChunkChars { result = append(result, c) } } return result } func splitOnHeadings(text string) []string { lines := strings.Split(text, "\n") var sections []string var current strings.Builder for _, line := range lines { trimmed := strings.TrimSpace(line) if (strings.HasPrefix(trimmed, "## ") || strings.HasPrefix(trimmed, "### ")) && current.Len() > 0 { sections = append(sections, current.String()) current.Reset() } current.WriteString(line) current.WriteString("\n") } if current.Len() > 0 { sections = append(sections, current.String()) } return sections } func splitSentences(text string) []string { // Split on ". " while preserving the period parts := strings.Split(text, ". ") var sentences []string for i, p := range parts { p = strings.TrimSpace(p) if p == "" { continue } if i < len(parts)-1 { p += "." } sentences = append(sentences, p) } return sentences }