fix: replace naive byte-scan findTag with DICOM stream walker

The old findTag scanned raw bytes for the 4-byte tag pattern, which caused false matches inside Siemens CSA private OB blobs (e.g. the large 0029,1020 Series Header). This corrupted body_part and other fields on Siemens MAGNETOM Sola MRIs because findTag(0x0018, 0x0015) hit a matching byte sequence inside the binary payload before reaching the real BodyPartExamined element. Fix: walkToTag() walks the DICOM element stream sequentially, reading VR and length fields to skip element values entirely. Falls back to byte-scan only on corrupt/truncated length fields. findLastTag updated to use the same walker.
2026-03-23 12:09:31 -04:00 · 2026-03-23 12:09:31 -04:00 · bf57e28e71
parent 989969375d
commit bf57e28e71
1 changed files with 189 additions and 16 deletions
--- a/lib/dicom.go
+++ b/lib/dicom.go
@ -292,11 +292,78 @@ func safeExtractPath(destDir, name string) (string, error) {
 // DICOM TAG READING
 // ============================================================================

-func findTag(data []byte, group, elem uint16) int {
+// walkToTag walks the DICOM element stream from startPos, respecting VR/length
+// fields so it never matches tag bytes inside binary payloads (e.g. Siemens CSA
+// OB blobs). Returns the byte offset of the matching element, or -1.
+func walkToTag(data []byte, startPos int, group, elem uint16) int {
+	pos := startPos
+	n := len(data)
+	for pos+4 <= n {
+		if pos+4 > n {
+			break
+		}
+		g := binary.LittleEndian.Uint16(data[pos : pos+2])
+		e := binary.LittleEndian.Uint16(data[pos+2 : pos+4])
+		if g == group && e == elem {
+			return pos
+		}
+		// Determine value length to skip to next element.
+		// If we can't parse a sensible length, fall back to byte scan.
+		if pos+6 > n {
+			break
+		}
+		vr := string(data[pos+4 : pos+6])
+		var valLen uint32
+		var headerLen int
+		if isValidVR(data, pos+4) {
+			// Explicit VR
+			switch vr {
+			case "OB", "OW", "SQ", "UN", "OD", "UC", "UR", "UT":
+				// 4-byte reserved + 4-byte length
+				if pos+12 > n {
+					break
+				}
+				valLen = binary.LittleEndian.Uint32(data[pos+8 : pos+12])
+				headerLen = 12
+			default:
+				// 2-byte length
+				if pos+8 > n {
+					break
+				}
+				valLen = uint32(binary.LittleEndian.Uint16(data[pos+6 : pos+8]))
+				headerLen = 8
+			}
+		} else {
+			// Implicit VR: tag(4) + length(4)
+			if pos+8 > n {
+				break
+			}
+			valLen = binary.LittleEndian.Uint32(data[pos+4 : pos+8])
+			headerLen = 8
+		}
+		// 0xFFFFFFFF = undefined length (SQ/item) — step past header only and
+		// let the inner loop find the sequence delimiter naturally.
+		if valLen == 0xFFFFFFFF {
+			pos += headerLen
+		} else {
+			next := pos + headerLen + int(valLen)
+			if next <= pos || next > n {
+				// Corrupt/truncated length — fall back to byte-scan from here
+				return findTagBytes(data, pos+1, group, elem)
+			}
+			pos = next
+		}
+	}
+	return -1
+}
+
+// findTagBytes is the original byte-scan fallback used only when the stream
+// walker cannot continue (corrupt length field).
+func findTagBytes(data []byte, startPos int, group, elem uint16) int {
 	target := make([]byte, 4)
 	binary.LittleEndian.PutUint16(target[0:2], group)
 	binary.LittleEndian.PutUint16(target[2:4], elem)
-	for i := 0; i < len(data)-4; i++ {
+	for i := startPos; i < len(data)-4; i++ {
 		if data[i] == target[0] && data[i+1] == target[1] &&
 			data[i+2] == target[2] && data[i+3] == target[3] {
 			return i
@ -305,21 +372,76 @@ func findTag(data []byte, group, elem uint16) int {
 	return -1
 }

-func findLastTag(data []byte, group, elem uint16) int {
-	target := make([]byte, 4)
-	binary.LittleEndian.PutUint16(target[0:2], group)
-	binary.LittleEndian.PutUint16(target[2:4], elem)
-	lastPos := -1
-	for i := 0; i < len(data)-4; i++ {
-		if data[i] == target[0] && data[i+1] == target[1] &&
-			data[i+2] == target[2] && data[i+3] == target[3] {
-			lastPos = i
+// findTag finds the first occurrence of a DICOM tag, using the stream walker
+// starting from the DICOM preamble offset (128-byte preamble + 4-byte DICM).
+func findTag(data []byte, group, elem uint16) int {
+	// DICOM files start with 128-byte preamble + "DICM" magic.
+	// Meta header (group 0x0002) always lives there; for the main dataset
+	// we start the walk right after the preamble when present.
+	startPos := 0
+	if len(data) >= 132 && string(data[128:132]) == "DICM" {
+		startPos = 132
+		// For meta-header tags (group 0x0002), walk from 132.
+		// For dataset tags, also walk from 132 — the walker handles both.
 	}
+	result := walkToTag(data, startPos, group, elem)
+	if result < 0 && startPos > 0 {
+		// Retry from byte 0 for edge cases (no preamble, raw DICOM)
+		result = walkToTag(data, 0, group, elem)
+	}
+	return result
+}
+
+func findLastTag(data []byte, group, elem uint16) int {
+	// Walk the full stream and keep the last match position.
+	startPos := 0
+	if len(data) >= 132 && string(data[128:132]) == "DICM" {
+		startPos = 132
+	}
+	lastPos := -1
+	pos := startPos
+	for {
+		found := walkToTag(data, pos, group, elem)
+		if found < 0 {
+			break
+		}
+		lastPos = found
+		pos = found + 1 // advance past this match to find a later one
 	}
 	return lastPos
 }

-func readStringTag(data []byte, group, elem uint16) string {
+// isValidVR checks if the 2 bytes at offset look like a valid DICOM VR
+func isValidVR(data []byte, offset int) bool {
+	if offset+2 > len(data) {
+		return false
+	}
+	vr := string(data[offset : offset+2])
+	validVRs := map[string]bool{
+		"AE": true, "AS": true, "AT": true, "CS": true, "DA": true, "DS": true, "DT": true,
+		"FL": true, "FD": true, "IS": true, "LO": true, "LT": true, "OB": true, "OD": true,
+		"OF": true, "OW": true, "PN": true, "SH": true, "SL": true, "SQ": true, "SS": true,
+		"ST": true, "TM": true, "UC": true, "UI": true, "UL": true, "UN": true, "UR": true,
+		"US": true, "UT": true,
+	}
+	return validVRs[vr]
+}
+
+// isImplicitVR returns true if transfer syntax uses implicit VR (no VR field in data elements)
+func isImplicitVR(data []byte) bool {
+	// Check transfer syntax UID from file meta info (group 0x0002)
+	ts := readStringTagExplicit(data, 0x0002, 0x0010)
+	if ts == "" {
+		// No transfer syntax specified - default to Explicit VR Little Endian
+		return false
+	}
+	// Implicit VR Little Endian: 1.2.840.10008.1.2
+	// Also check for Siemens private implicit VR variants
+	return ts == "1.2.840.10008.1.2" || strings.Contains(ts, "1.2.276.0.7230010")
+}
+
+// readStringTagExplicit reads with explicit VR assumption (for meta-header)
+func readStringTagExplicit(data []byte, group, elem uint16) string {
 	pos := findTag(data, group, elem)
 	if pos < 0 {
 		return ""
@ -327,7 +449,54 @@ func readStringTag(data []byte, group, elem uint16) string {
 	vr := string(data[pos+4 : pos+6])
 	var length uint16
 	var valPos int
-	if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" {
+	if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" || vr == "OD" || vr == "UC" || vr == "UT" {
+		length = uint16(binary.LittleEndian.Uint32(data[pos+8 : pos+12]))
+		valPos = pos + 12
+	} else {
+		length = binary.LittleEndian.Uint16(data[pos+6 : pos+8])
+		valPos = pos + 8
+	}
+	if valPos+int(length) > len(data) {
+		return ""
+	}
+	raw := data[valPos : valPos+int(length)]
+	return strings.TrimRight(string(raw), " \x00")
+}
+
+func readStringTag(data []byte, group, elem uint16) string {
+	pos := findTag(data, group, elem)
+	if pos < 0 {
+		return ""
+	}
+
+	// Check for implicit VR by validating the VR field
+	implicitVR := !isValidVR(data, pos+4)
+	if implicitVR {
+		// Implicit VR: tag (4) + length (4) + value
+		length := binary.LittleEndian.Uint32(data[pos+4 : pos+8])
+		valPos := pos + 8
+		if valPos+int(length) > len(data) {
+			return ""
+		}
+		raw := data[valPos : valPos+int(length)]
+		var s string
+		if utf8.Valid(raw) {
+			s = string(raw)
+		} else {
+			runes := make([]rune, len(raw))
+			for i, b := range raw {
+				runes[i] = rune(b)
+			}
+			s = string(runes)
+		}
+		return strings.TrimRight(s, " \x00")
+	}
+
+	// Explicit VR path
+	vr := string(data[pos+4 : pos+6])
+	var length uint16
+	var valPos int
+	if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" || vr == "OD" || vr == "UC" || vr == "UT" {
 		length = uint16(binary.LittleEndian.Uint32(data[pos+8 : pos+12]))
 		valPos = pos + 12
 	} else {
@ -366,6 +535,8 @@ func readIntTagSmart(data []byte, group, elem uint16) int {
 	if pos < 0 {
 		return 0
 	}
+	// Check for implicit VR before reading VR field
+	if isValidVR(data, pos+4) {
 		vr := string(data[pos+4 : pos+6])
 		if vr == "US" || vr == "SS" {
 			valPos := pos + 8
@ -373,6 +544,8 @@ func readIntTagSmart(data []byte, group, elem uint16) int {
 				return int(binary.LittleEndian.Uint16(data[valPos : valPos+2]))
 			}
 		}
+	}
+	// For implicit VR or non-integer VRs, fall back to string parsing
 	s := strings.TrimSpace(readStringTag(data, group, elem))
 	n, _ := strconv.Atoi(s)
 	return n