fix: replace naive byte-scan findTag with DICOM stream walker

The old findTag scanned raw bytes for the 4-byte tag pattern, which caused false matches inside Siemens CSA private OB blobs (e.g. the large 0029,1020 Series Header). This corrupted body_part and other fields on Siemens MAGNETOM Sola MRIs because findTag(0x0018, 0x0015) hit a matching byte sequence inside the binary payload before reaching the real BodyPartExamined element. Fix: walkToTag() walks the DICOM element stream sequentially, reading VR and length fields to skip element values entirely. Falls back to byte-scan only on corrupt/truncated length fields. findLastTag updated to use the same walker.
2026-03-23 12:09:31 -04:00 · 2026-03-23 12:09:31 -04:00 · bf57e28e71
parent 989969375d
commit bf57e28e71
1 changed files with 189 additions and 16 deletions
--- a/lib/dicom.go
+++ b/lib/dicom.go
@ -292,11 +292,78 @@ func safeExtractPath(destDir, name string) (string, error) {
 // DICOM TAG READING
 // ============================================================================
-func findTag(data []byte, group, elem uint16) int {
+// walkToTag walks the DICOM element stream from startPos, respecting VR/length
 // fields so it never matches tag bytes inside binary payloads (e.g. Siemens CSA
 // OB blobs). Returns the byte offset of the matching element, or -1.
 func walkToTag(data []byte, startPos int, group, elem uint16) int {
 	pos := startPos
 	n := len(data)
 	for pos+4 <= n {
 		if pos+4 > n {
 			break
 		}
 		g := binary.LittleEndian.Uint16(data[pos : pos+2])
 		e := binary.LittleEndian.Uint16(data[pos+2 : pos+4])
 		if g == group && e == elem {
 			return pos
 		}
 		// Determine value length to skip to next element.
 		// If we can't parse a sensible length, fall back to byte scan.
 		if pos+6 > n {
 			break
 		}
 		vr := string(data[pos+4 : pos+6])
 		var valLen uint32
 		var headerLen int
 		if isValidVR(data, pos+4) {
 			// Explicit VR
 			switch vr {
 			case "OB", "OW", "SQ", "UN", "OD", "UC", "UR", "UT":
 				// 4-byte reserved + 4-byte length
 				if pos+12 > n {
 					break
 				}
 				valLen = binary.LittleEndian.Uint32(data[pos+8 : pos+12])
 				headerLen = 12
 			default:
 				// 2-byte length
 				if pos+8 > n {
 					break
 				}
 				valLen = uint32(binary.LittleEndian.Uint16(data[pos+6 : pos+8]))
 				headerLen = 8
 			}
 		} else {
 			// Implicit VR: tag(4) + length(4)
 			if pos+8 > n {
 				break
 			}
 			valLen = binary.LittleEndian.Uint32(data[pos+4 : pos+8])
 			headerLen = 8
 		}
 		// 0xFFFFFFFF = undefined length (SQ/item) — step past header only and
 		// let the inner loop find the sequence delimiter naturally.
 		if valLen == 0xFFFFFFFF {
 			pos += headerLen
 		} else {
 			next := pos + headerLen + int(valLen)
 			if next <= pos || next > n {
 				// Corrupt/truncated length — fall back to byte-scan from here
 				return findTagBytes(data, pos+1, group, elem)
 			}
 			pos = next
 		}
 	}
 	return -1
 }
 // findTagBytes is the original byte-scan fallback used only when the stream
 // walker cannot continue (corrupt length field).
 func findTagBytes(data []byte, startPos int, group, elem uint16) int {
 	target := make([]byte, 4)
 	binary.LittleEndian.PutUint16(target[0:2], group)
 	binary.LittleEndian.PutUint16(target[2:4], elem)
-	for i := 0; i < len(data)-4; i++ {
+	for i := startPos; i < len(data)-4; i++ {
 		if data[i] == target[0] && data[i+1] == target[1] &&
 			data[i+2] == target[2] && data[i+3] == target[3] {
 			return i
@ -305,21 +372,76 @@ func findTag(data []byte, group, elem uint16) int {
 	return -1
 }
-func findLastTag(data []byte, group, elem uint16) int {
+// findTag finds the first occurrence of a DICOM tag, using the stream walker
-	target := make([]byte, 4)
+// starting from the DICOM preamble offset (128-byte preamble + 4-byte DICM).
-	binary.LittleEndian.PutUint16(target[0:2], group)
+func findTag(data []byte, group, elem uint16) int {
-	binary.LittleEndian.PutUint16(target[2:4], elem)
+	// DICOM files start with 128-byte preamble + "DICM" magic.
-	lastPos := -1
+	// Meta header (group 0x0002) always lives there; for the main dataset
-	for i := 0; i < len(data)-4; i++ {
+	// we start the walk right after the preamble when present.
-		if data[i] == target[0] && data[i+1] == target[1] &&
+	startPos := 0
-			data[i+2] == target[2] && data[i+3] == target[3] {
+	if len(data) >= 132 && string(data[128:132]) == "DICM" {
-			lastPos = i
+		startPos = 132
 		// For meta-header tags (group 0x0002), walk from 132.
 		// For dataset tags, also walk from 132 — the walker handles both.
 	}
 	result := walkToTag(data, startPos, group, elem)
 	if result < 0 && startPos > 0 {
 		// Retry from byte 0 for edge cases (no preamble, raw DICOM)
 		result = walkToTag(data, 0, group, elem)
 	}
 	return result
 }
 func findLastTag(data []byte, group, elem uint16) int {
 	// Walk the full stream and keep the last match position.
 	startPos := 0
 	if len(data) >= 132 && string(data[128:132]) == "DICM" {
 		startPos = 132
 	}
 	lastPos := -1
 	pos := startPos
 	for {
 		found := walkToTag(data, pos, group, elem)
 		if found < 0 {
 			break
 		}
 		lastPos = found
 		pos = found + 1 // advance past this match to find a later one
 	}
 	return lastPos
 }
-func readStringTag(data []byte, group, elem uint16) string {
+// isValidVR checks if the 2 bytes at offset look like a valid DICOM VR
 func isValidVR(data []byte, offset int) bool {
 	if offset+2 > len(data) {
 		return false
 	}
 	vr := string(data[offset : offset+2])
 	validVRs := map[string]bool{
 		"AE": true, "AS": true, "AT": true, "CS": true, "DA": true, "DS": true, "DT": true,
 		"FL": true, "FD": true, "IS": true, "LO": true, "LT": true, "OB": true, "OD": true,
 		"OF": true, "OW": true, "PN": true, "SH": true, "SL": true, "SQ": true, "SS": true,
 		"ST": true, "TM": true, "UC": true, "UI": true, "UL": true, "UN": true, "UR": true,
 		"US": true, "UT": true,
 	}
 	return validVRs[vr]
 }
 // isImplicitVR returns true if transfer syntax uses implicit VR (no VR field in data elements)
 func isImplicitVR(data []byte) bool {
 	// Check transfer syntax UID from file meta info (group 0x0002)
 	ts := readStringTagExplicit(data, 0x0002, 0x0010)
 	if ts == "" {
 		// No transfer syntax specified - default to Explicit VR Little Endian
 		return false
 	}
 	// Implicit VR Little Endian: 1.2.840.10008.1.2
 	// Also check for Siemens private implicit VR variants
 	return ts == "1.2.840.10008.1.2" || strings.Contains(ts, "1.2.276.0.7230010")
 }
 // readStringTagExplicit reads with explicit VR assumption (for meta-header)
 func readStringTagExplicit(data []byte, group, elem uint16) string {
 	pos := findTag(data, group, elem)
 	if pos < 0 {
 		return ""
@ -327,7 +449,54 @@ func readStringTag(data []byte, group, elem uint16) string {
 	vr := string(data[pos+4 : pos+6])
 	var length uint16
 	var valPos int
-	if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" {
+	if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" || vr == "OD" || vr == "UC" || vr == "UT" {
 		length = uint16(binary.LittleEndian.Uint32(data[pos+8 : pos+12]))
 		valPos = pos + 12
 	} else {
 		length = binary.LittleEndian.Uint16(data[pos+6 : pos+8])
 		valPos = pos + 8
 	}
 	if valPos+int(length) > len(data) {
 		return ""
 	}
 	raw := data[valPos : valPos+int(length)]
 	return strings.TrimRight(string(raw), " \x00")
 }
 func readStringTag(data []byte, group, elem uint16) string {
 	pos := findTag(data, group, elem)
 	if pos < 0 {
 		return ""
 	}
 	// Check for implicit VR by validating the VR field
 	implicitVR := !isValidVR(data, pos+4)
 	if implicitVR {
 		// Implicit VR: tag (4) + length (4) + value
 		length := binary.LittleEndian.Uint32(data[pos+4 : pos+8])
 		valPos := pos + 8
 		if valPos+int(length) > len(data) {
 			return ""
 		}
 		raw := data[valPos : valPos+int(length)]
 		var s string
 		if utf8.Valid(raw) {
 			s = string(raw)
 		} else {
 			runes := make([]rune, len(raw))
 			for i, b := range raw {
 				runes[i] = rune(b)
 			}
 			s = string(runes)
 		}
 		return strings.TrimRight(s, " \x00")
 	}
 	// Explicit VR path
 	vr := string(data[pos+4 : pos+6])
 	var length uint16
 	var valPos int
 	if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" || vr == "OD" || vr == "UC" || vr == "UT" {
 		length = uint16(binary.LittleEndian.Uint32(data[pos+8 : pos+12]))
 		valPos = pos + 12
 	} else {
@ -366,6 +535,8 @@ func readIntTagSmart(data []byte, group, elem uint16) int {
 	if pos < 0 {
 		return 0
 	}
 	// Check for implicit VR before reading VR field
 	if isValidVR(data, pos+4) {
 		vr := string(data[pos+4 : pos+6])
 		if vr == "US" || vr == "SS" {
 			valPos := pos + 8
@ -373,6 +544,8 @@ func readIntTagSmart(data []byte, group, elem uint16) int {
 				return int(binary.LittleEndian.Uint16(data[valPos : valPos+2]))
 			}
 		}
 	}
 	// For implicit VR or non-integer VRs, fall back to string parsing
 	s := strings.TrimSpace(readStringTag(data, group, elem))
 	n, _ := strconv.Atoi(s)
 	return n