fix: replace naive byte-scan findTag with DICOM stream walker

The old findTag scanned raw bytes for the 4-byte tag pattern, which
caused false matches inside Siemens CSA private OB blobs (e.g. the
large 0029,1020 Series Header). This corrupted body_part and other
fields on Siemens MAGNETOM Sola MRIs because findTag(0x0018, 0x0015)
hit a matching byte sequence inside the binary payload before reaching
the real BodyPartExamined element.

Fix: walkToTag() walks the DICOM element stream sequentially, reading
VR and length fields to skip element values entirely. Falls back to
byte-scan only on corrupt/truncated length fields. findLastTag updated
to use the same walker.
This commit is contained in:
James 2026-03-23 12:09:31 -04:00
parent 989969375d
commit bf57e28e71
1 changed files with 189 additions and 16 deletions

View File

@ -292,11 +292,78 @@ func safeExtractPath(destDir, name string) (string, error) {
// DICOM TAG READING // DICOM TAG READING
// ============================================================================ // ============================================================================
func findTag(data []byte, group, elem uint16) int { // walkToTag walks the DICOM element stream from startPos, respecting VR/length
// fields so it never matches tag bytes inside binary payloads (e.g. Siemens CSA
// OB blobs). Returns the byte offset of the matching element, or -1.
func walkToTag(data []byte, startPos int, group, elem uint16) int {
	// Parameters:
	//   data      raw little-endian DICOM bytes.
	//   startPos  offset of an element boundary at which to begin walking.
	//   group/elem the tag to look for.
	// Returns the offset of the first matching tag at or after startPos, -1
	// when the stream ends (or a header is truncated) without a match, or the
	// result of findTagBytes when a value length runs past the end of data.
	n := len(data)
	pos := startPos
	for pos+4 <= n {
		g := binary.LittleEndian.Uint16(data[pos : pos+2])
		e := binary.LittleEndian.Uint16(data[pos+2 : pos+4])
		if g == group && e == elem {
			return pos
		}

		// Work out the header size and value length of this element so the
		// walk can jump straight over its value.
		var valLen uint32
		var headerLen int
		if isValidVR(data, pos+4) {
			// Explicit VR. isValidVR has already confirmed pos+6 <= n.
			switch string(data[pos+4 : pos+6]) {
			case "OB", "OD", "OF", "OL", "OV", "OW", "SQ", "SV", "UC", "UN", "UR", "UT", "UV":
				// Long form: tag(4) + VR(2) + reserved(2) + length(4).
				if pos+12 > n {
					// Truncated header: nothing further can be parsed. A bare
					// break here would only leave the switch, so return.
					return -1
				}
				valLen = binary.LittleEndian.Uint32(data[pos+8 : pos+12])
				headerLen = 12
			default:
				// Short form: tag(4) + VR(2) + length(2).
				if pos+8 > n {
					return -1
				}
				valLen = uint32(binary.LittleEndian.Uint16(data[pos+6 : pos+8]))
				headerLen = 8
			}
		} else {
			// Implicit VR (also item/delimiter tags): tag(4) + length(4).
			if pos+8 > n {
				return -1
			}
			valLen = binary.LittleEndian.Uint32(data[pos+4 : pos+8])
			headerLen = 8
		}

		// 0xFFFFFFFF = undefined length (SQ/item): step past the header only
		// and keep walking through the nested elements and delimiters.
		if valLen == 0xFFFFFFFF {
			pos += headerLen
			continue
		}

		// Compare in uint64 so a huge length cannot wrap a 32-bit int.
		// pos+headerLen <= n was established above, so remaining >= 0.
		remaining := n - pos - headerLen
		if uint64(valLen) > uint64(remaining) {
			// Corrupt/truncated length: fall back to byte-scan from here.
			return findTagBytes(data, pos+1, group, elem)
		}
		pos += headerLen + int(valLen)
	}
	return -1
}
// findTagBytes is the original byte-scan fallback used only when the stream
// walker cannot continue (corrupt length field).
func findTagBytes(data []byte, startPos int, group, elem uint16) int {
target := make([]byte, 4) target := make([]byte, 4)
binary.LittleEndian.PutUint16(target[0:2], group) binary.LittleEndian.PutUint16(target[0:2], group)
binary.LittleEndian.PutUint16(target[2:4], elem) binary.LittleEndian.PutUint16(target[2:4], elem)
for i := 0; i < len(data)-4; i++ { for i := startPos; i < len(data)-4; i++ {
if data[i] == target[0] && data[i+1] == target[1] && if data[i] == target[0] && data[i+1] == target[1] &&
data[i+2] == target[2] && data[i+3] == target[3] { data[i+2] == target[2] && data[i+3] == target[3] {
return i return i
@ -305,21 +372,76 @@ func findTag(data []byte, group, elem uint16) int {
return -1 return -1
} }
func findLastTag(data []byte, group, elem uint16) int { // findTag finds the first occurrence of a DICOM tag, using the stream walker
target := make([]byte, 4) // starting from the DICOM preamble offset (128-byte preamble + 4-byte DICM).
binary.LittleEndian.PutUint16(target[0:2], group) func findTag(data []byte, group, elem uint16) int {
binary.LittleEndian.PutUint16(target[2:4], elem) // DICOM files start with 128-byte preamble + "DICM" magic.
lastPos := -1 // Meta header (group 0x0002) always lives there; for the main dataset
for i := 0; i < len(data)-4; i++ { // we start the walk right after the preamble when present.
if data[i] == target[0] && data[i+1] == target[1] && startPos := 0
data[i+2] == target[2] && data[i+3] == target[3] { if len(data) >= 132 && string(data[128:132]) == "DICM" {
lastPos = i startPos = 132
// For meta-header tags (group 0x0002), walk from 132.
// For dataset tags, also walk from 132 — the walker handles both.
} }
result := walkToTag(data, startPos, group, elem)
if result < 0 && startPos > 0 {
// Retry from byte 0 for edge cases (no preamble, raw DICOM)
result = walkToTag(data, 0, group, elem)
}
return result
}
func findLastTag(data []byte, group, elem uint16) int {
// Walk the full stream and keep the last match position.
startPos := 0
if len(data) >= 132 && string(data[128:132]) == "DICM" {
startPos = 132
}
lastPos := -1
pos := startPos
for {
found := walkToTag(data, pos, group, elem)
if found < 0 {
break
}
lastPos = found
pos = found + 1 // advance past this match to find a later one
} }
return lastPos return lastPos
} }
func readStringTag(data []byte, group, elem uint16) string { // isValidVR checks if the 2 bytes at offset look like a valid DICOM VR
func isValidVR(data []byte, offset int) bool {
	// Reports whether the two bytes at data[offset:offset+2] spell a DICOM
	// Value Representation code. Returns false when fewer than two bytes are
	// available at offset (or offset is negative).
	if offset < 0 || offset+2 > len(data) {
		return false
	}
	// A switch on the converted slice avoids rebuilding a lookup map on every
	// call; the element walker calls this once per element.
	switch string(data[offset : offset+2]) {
	case "AE", "AS", "AT", "CS", "DA", "DS", "DT",
		"FL", "FD", "IS", "LO", "LT", "OB", "OD",
		"OF", "OL", "OV", "OW", "PN", "SH", "SL",
		"SQ", "SS", "ST", "SV", "TM", "UC", "UI",
		"UL", "UN", "UR", "US", "UT", "UV":
		return true
	}
	return false
}
// isImplicitVR reports whether the Transfer Syntax UID stored in the file
// meta information (tag 0002,0010) selects an implicit-VR encoding, i.e. data
// elements carry no VR field.
func isImplicitVR(data []byte) bool {
	const (
		// Implicit VR Little Endian.
		implicitLittleEndian = "1.2.840.10008.1.2"
		// UID root also treated as implicit VR.
		// NOTE(review): confirm which implementations emit transfer syntaxes
		// under this root; it is matched anywhere in the UID, not as a prefix.
		privateImplicitRoot = "1.2.276.0.7230010"
	)

	switch ts := readStringTagExplicit(data, 0x0002, 0x0010); {
	case ts == "":
		// No transfer syntax present: assume Explicit VR Little Endian.
		return false
	case ts == implicitLittleEndian:
		return true
	default:
		return strings.Contains(ts, privateImplicitRoot)
	}
}
// readStringTagExplicit reads with explicit VR assumption (for meta-header)
func readStringTagExplicit(data []byte, group, elem uint16) string {
pos := findTag(data, group, elem) pos := findTag(data, group, elem)
if pos < 0 { if pos < 0 {
return "" return ""
@ -327,7 +449,54 @@ func readStringTag(data []byte, group, elem uint16) string {
vr := string(data[pos+4 : pos+6]) vr := string(data[pos+4 : pos+6])
var length uint16 var length uint16
var valPos int var valPos int
if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" { if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" || vr == "OD" || vr == "UC" || vr == "UT" {
length = uint16(binary.LittleEndian.Uint32(data[pos+8 : pos+12]))
valPos = pos + 12
} else {
length = binary.LittleEndian.Uint16(data[pos+6 : pos+8])
valPos = pos + 8
}
if valPos+int(length) > len(data) {
return ""
}
raw := data[valPos : valPos+int(length)]
return strings.TrimRight(string(raw), " \x00")
}
func readStringTag(data []byte, group, elem uint16) string {
pos := findTag(data, group, elem)
if pos < 0 {
return ""
}
// Check for implicit VR by validating the VR field
implicitVR := !isValidVR(data, pos+4)
if implicitVR {
// Implicit VR: tag (4) + length (4) + value
length := binary.LittleEndian.Uint32(data[pos+4 : pos+8])
valPos := pos + 8
if valPos+int(length) > len(data) {
return ""
}
raw := data[valPos : valPos+int(length)]
var s string
if utf8.Valid(raw) {
s = string(raw)
} else {
runes := make([]rune, len(raw))
for i, b := range raw {
runes[i] = rune(b)
}
s = string(runes)
}
return strings.TrimRight(s, " \x00")
}
// Explicit VR path
vr := string(data[pos+4 : pos+6])
var length uint16
var valPos int
if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" || vr == "OD" || vr == "UC" || vr == "UT" {
length = uint16(binary.LittleEndian.Uint32(data[pos+8 : pos+12])) length = uint16(binary.LittleEndian.Uint32(data[pos+8 : pos+12]))
valPos = pos + 12 valPos = pos + 12
} else { } else {
@ -366,6 +535,8 @@ func readIntTagSmart(data []byte, group, elem uint16) int {
if pos < 0 { if pos < 0 {
return 0 return 0
} }
// Check for implicit VR before reading VR field
if isValidVR(data, pos+4) {
vr := string(data[pos+4 : pos+6]) vr := string(data[pos+4 : pos+6])
if vr == "US" || vr == "SS" { if vr == "US" || vr == "SS" {
valPos := pos + 8 valPos := pos + 8
@ -373,6 +544,8 @@ func readIntTagSmart(data []byte, group, elem uint16) int {
return int(binary.LittleEndian.Uint16(data[valPos : valPos+2])) return int(binary.LittleEndian.Uint16(data[valPos : valPos+2]))
} }
} }
}
// For implicit VR or non-integer VRs, fall back to string parsing
s := strings.TrimSpace(readStringTag(data, group, elem)) s := strings.TrimSpace(readStringTag(data, group, elem))
n, _ := strconv.Atoi(s) n, _ := strconv.Atoi(s)
return n return n