From bf57e28e71f212bd6e4d6af2dcc0c5f99bc2e62a Mon Sep 17 00:00:00 2001 From: James Date: Mon, 23 Mar 2026 12:09:31 -0400 Subject: [PATCH] fix: replace naive byte-scan findTag with DICOM stream walker The old findTag scanned raw bytes for the 4-byte tag pattern, which caused false matches inside Siemens CSA private OB blobs (e.g. the large 0029,1020 Series Header). This corrupted body_part and other fields on Siemens MAGNETOM Sola MRIs because findTag(0x0018, 0x0015) hit a matching byte sequence inside the binary payload before reaching the real BodyPartExamined element. Fix: walkToTag() walks the DICOM element stream sequentially, reading VR and length fields to skip element values entirely. Falls back to byte-scan only on corrupt/truncated length fields. findLastTag updated to use the same walker. --- lib/dicom.go | 205 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 189 insertions(+), 16 deletions(-) diff --git a/lib/dicom.go b/lib/dicom.go index 0910e50..7640c3e 100644 --- a/lib/dicom.go +++ b/lib/dicom.go @@ -292,11 +292,78 @@ func safeExtractPath(destDir, name string) (string, error) { // DICOM TAG READING // ============================================================================ -func findTag(data []byte, group, elem uint16) int { +// walkToTag walks the DICOM element stream from startPos, respecting VR/length +// fields so it never matches tag bytes inside binary payloads (e.g. Siemens CSA +// OB blobs). Returns the byte offset of the matching element, or -1. +func walkToTag(data []byte, startPos int, group, elem uint16) int { + pos := startPos + n := len(data) + for pos+4 <= n { + if pos+4 > n { + break + } + g := binary.LittleEndian.Uint16(data[pos : pos+2]) + e := binary.LittleEndian.Uint16(data[pos+2 : pos+4]) + if g == group && e == elem { + return pos + } + // Determine value length to skip to next element. + // If we can't parse a sensible length, fall back to byte scan. + if pos+6 > n { + break + } + vr := string(data[pos+4 : pos+6]) + var valLen uint32 + var headerLen int + if isValidVR(data, pos+4) { + // Explicit VR + switch vr { + case "OB", "OW", "SQ", "UN", "OD", "UC", "UR", "UT": + // 4-byte reserved + 4-byte length + if pos+12 > n { + break + } + valLen = binary.LittleEndian.Uint32(data[pos+8 : pos+12]) + headerLen = 12 + default: + // 2-byte length + if pos+8 > n { + break + } + valLen = uint32(binary.LittleEndian.Uint16(data[pos+6 : pos+8])) + headerLen = 8 + } + } else { + // Implicit VR: tag(4) + length(4) + if pos+8 > n { + break + } + valLen = binary.LittleEndian.Uint32(data[pos+4 : pos+8]) + headerLen = 8 + } + // 0xFFFFFFFF = undefined length (SQ/item) — step past header only and + // let the inner loop find the sequence delimiter naturally. + if valLen == 0xFFFFFFFF { + pos += headerLen + } else { + next := pos + headerLen + int(valLen) + if next <= pos || next > n { + // Corrupt/truncated length — fall back to byte-scan from here + return findTagBytes(data, pos+1, group, elem) + } + pos = next + } + } + return -1 +} + +// findTagBytes is the original byte-scan fallback used only when the stream +// walker cannot continue (corrupt length field). +func findTagBytes(data []byte, startPos int, group, elem uint16) int { target := make([]byte, 4) binary.LittleEndian.PutUint16(target[0:2], group) binary.LittleEndian.PutUint16(target[2:4], elem) - for i := 0; i < len(data)-4; i++ { + for i := startPos; i < len(data)-4; i++ { if data[i] == target[0] && data[i+1] == target[1] && data[i+2] == target[2] && data[i+3] == target[3] { return i @@ -305,21 +372,76 @@ func findTag(data []byte, group, elem uint16) int { return -1 } +// findTag finds the first occurrence of a DICOM tag, using the stream walker +// starting from the DICOM preamble offset (128-byte preamble + 4-byte DICM). +func findTag(data []byte, group, elem uint16) int { + // DICOM files start with 128-byte preamble + "DICM" magic. + // Meta header (group 0x0002) always lives there; for the main dataset + // we start the walk right after the preamble when present. + startPos := 0 + if len(data) >= 132 && string(data[128:132]) == "DICM" { + startPos = 132 + // For meta-header tags (group 0x0002), walk from 132. + // For dataset tags, also walk from 132 — the walker handles both. + } + result := walkToTag(data, startPos, group, elem) + if result < 0 && startPos > 0 { + // Retry from byte 0 for edge cases (no preamble, raw DICOM) + result = walkToTag(data, 0, group, elem) + } + return result +} + func findLastTag(data []byte, group, elem uint16) int { - target := make([]byte, 4) - binary.LittleEndian.PutUint16(target[0:2], group) - binary.LittleEndian.PutUint16(target[2:4], elem) + // Walk the full stream and keep the last match position. + startPos := 0 + if len(data) >= 132 && string(data[128:132]) == "DICM" { + startPos = 132 + } lastPos := -1 - for i := 0; i < len(data)-4; i++ { - if data[i] == target[0] && data[i+1] == target[1] && - data[i+2] == target[2] && data[i+3] == target[3] { - lastPos = i + pos := startPos + for { + found := walkToTag(data, pos, group, elem) + if found < 0 { + break } + lastPos = found + pos = found + 1 // advance past this match to find a later one } return lastPos } -func readStringTag(data []byte, group, elem uint16) string { +// isValidVR checks if the 2 bytes at offset look like a valid DICOM VR +func isValidVR(data []byte, offset int) bool { + if offset+2 > len(data) { + return false + } + vr := string(data[offset : offset+2]) + validVRs := map[string]bool{ + "AE": true, "AS": true, "AT": true, "CS": true, "DA": true, "DS": true, "DT": true, + "FL": true, "FD": true, "IS": true, "LO": true, "LT": true, "OB": true, "OD": true, + "OF": true, "OW": true, "PN": true, "SH": true, "SL": true, "SQ": true, "SS": true, + "ST": true, "TM": true, "UC": true, "UI": true, "UL": true, "UN": true, "UR": true, + "US": true, "UT": true, + } + return validVRs[vr] +} + +// isImplicitVR returns true if transfer syntax uses implicit VR (no VR field in data elements) +func isImplicitVR(data []byte) bool { + // Check transfer syntax UID from file meta info (group 0x0002) + ts := readStringTagExplicit(data, 0x0002, 0x0010) + if ts == "" { + // No transfer syntax specified - default to Explicit VR Little Endian + return false + } + // Implicit VR Little Endian: 1.2.840.10008.1.2 + // Also check for Siemens private implicit VR variants + return ts == "1.2.840.10008.1.2" || strings.Contains(ts, "1.2.276.0.7230010") +} + +// readStringTagExplicit reads with explicit VR assumption (for meta-header) +func readStringTagExplicit(data []byte, group, elem uint16) string { pos := findTag(data, group, elem) if pos < 0 { return "" @@ -327,7 +449,54 @@ func readStringTag(data []byte, group, elem uint16) string { vr := string(data[pos+4 : pos+6]) var length uint16 var valPos int - if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" { + if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" || vr == "OD" || vr == "UC" || vr == "UT" { + length = uint16(binary.LittleEndian.Uint32(data[pos+8 : pos+12])) + valPos = pos + 12 + } else { + length = binary.LittleEndian.Uint16(data[pos+6 : pos+8]) + valPos = pos + 8 + } + if valPos+int(length) > len(data) { + return "" + } + raw := data[valPos : valPos+int(length)] + return strings.TrimRight(string(raw), " \x00") +} + +func readStringTag(data []byte, group, elem uint16) string { + pos := findTag(data, group, elem) + if pos < 0 { + return "" + } + + // Check for implicit VR by validating the VR field + implicitVR := !isValidVR(data, pos+4) + if implicitVR { + // Implicit VR: tag (4) + length (4) + value + length := binary.LittleEndian.Uint32(data[pos+4 : pos+8]) + valPos := pos + 8 + if valPos+int(length) > len(data) { + return "" + } + raw := data[valPos : valPos+int(length)] + var s string + if utf8.Valid(raw) { + s = string(raw) + } else { + runes := make([]rune, len(raw)) + for i, b := range raw { + runes[i] = rune(b) + } + s = string(runes) + } + return strings.TrimRight(s, " \x00") + } + + // Explicit VR path + vr := string(data[pos+4 : pos+6]) + var length uint16 + var valPos int + if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" || vr == "OD" || vr == "UC" || vr == "UT" { length = uint16(binary.LittleEndian.Uint32(data[pos+8 : pos+12])) valPos = pos + 12 } else { @@ -366,13 +535,17 @@ func readIntTagSmart(data []byte, group, elem uint16) int { if pos < 0 { return 0 } - vr := string(data[pos+4 : pos+6]) - if vr == "US" || vr == "SS" { - valPos := pos + 8 - if valPos+2 <= len(data) { - return int(binary.LittleEndian.Uint16(data[valPos : valPos+2])) + // Check for implicit VR before reading VR field + if isValidVR(data, pos+4) { + vr := string(data[pos+4 : pos+6]) + if vr == "US" || vr == "SS" { + valPos := pos + 8 + if valPos+2 <= len(data) { + return int(binary.LittleEndian.Uint16(data[valPos : valPos+2])) + } } } + // For implicit VR or non-integer VRs, fall back to string parsing s := strings.TrimSpace(readStringTag(data, group, elem)) n, _ := strconv.Atoi(s) return n