fix: replace naive byte-scan findTag with DICOM stream walker
The old findTag scanned raw bytes for the 4-byte tag pattern, which caused false matches inside Siemens CSA private OB blobs (e.g. the large 0029,1020 Series Header). This corrupted body_part and other fields on Siemens MAGNETOM Sola MRIs because findTag(0x0018, 0x0015) hit a matching byte sequence inside the binary payload before reaching the real BodyPartExamined element. Fix: walkToTag() walks the DICOM element stream sequentially, reading VR and length fields to skip element values entirely. Falls back to byte-scan only on corrupt/truncated length fields. findLastTag updated to use the same walker.
This commit is contained in:
parent
989969375d
commit
bf57e28e71
199
lib/dicom.go
199
lib/dicom.go
|
|
@ -292,11 +292,78 @@ func safeExtractPath(destDir, name string) (string, error) {
|
|||
// DICOM TAG READING
|
||||
// ============================================================================
|
||||
|
||||
func findTag(data []byte, group, elem uint16) int {
|
||||
// walkToTag walks the DICOM element stream from startPos, respecting VR/length
|
||||
// fields so it never matches tag bytes inside binary payloads (e.g. Siemens CSA
|
||||
// OB blobs). Returns the byte offset of the matching element, or -1.
|
||||
func walkToTag(data []byte, startPos int, group, elem uint16) int {
|
||||
pos := startPos
|
||||
n := len(data)
|
||||
for pos+4 <= n {
|
||||
if pos+4 > n {
|
||||
break
|
||||
}
|
||||
g := binary.LittleEndian.Uint16(data[pos : pos+2])
|
||||
e := binary.LittleEndian.Uint16(data[pos+2 : pos+4])
|
||||
if g == group && e == elem {
|
||||
return pos
|
||||
}
|
||||
// Determine value length to skip to next element.
|
||||
// If we can't parse a sensible length, fall back to byte scan.
|
||||
if pos+6 > n {
|
||||
break
|
||||
}
|
||||
vr := string(data[pos+4 : pos+6])
|
||||
var valLen uint32
|
||||
var headerLen int
|
||||
if isValidVR(data, pos+4) {
|
||||
// Explicit VR
|
||||
switch vr {
|
||||
case "OB", "OW", "SQ", "UN", "OD", "UC", "UR", "UT":
|
||||
// 4-byte reserved + 4-byte length
|
||||
if pos+12 > n {
|
||||
break
|
||||
}
|
||||
valLen = binary.LittleEndian.Uint32(data[pos+8 : pos+12])
|
||||
headerLen = 12
|
||||
default:
|
||||
// 2-byte length
|
||||
if pos+8 > n {
|
||||
break
|
||||
}
|
||||
valLen = uint32(binary.LittleEndian.Uint16(data[pos+6 : pos+8]))
|
||||
headerLen = 8
|
||||
}
|
||||
} else {
|
||||
// Implicit VR: tag(4) + length(4)
|
||||
if pos+8 > n {
|
||||
break
|
||||
}
|
||||
valLen = binary.LittleEndian.Uint32(data[pos+4 : pos+8])
|
||||
headerLen = 8
|
||||
}
|
||||
// 0xFFFFFFFF = undefined length (SQ/item) — step past header only and
|
||||
// let the inner loop find the sequence delimiter naturally.
|
||||
if valLen == 0xFFFFFFFF {
|
||||
pos += headerLen
|
||||
} else {
|
||||
next := pos + headerLen + int(valLen)
|
||||
if next <= pos || next > n {
|
||||
// Corrupt/truncated length — fall back to byte-scan from here
|
||||
return findTagBytes(data, pos+1, group, elem)
|
||||
}
|
||||
pos = next
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// findTagBytes is the original byte-scan fallback used only when the stream
|
||||
// walker cannot continue (corrupt length field).
|
||||
func findTagBytes(data []byte, startPos int, group, elem uint16) int {
|
||||
target := make([]byte, 4)
|
||||
binary.LittleEndian.PutUint16(target[0:2], group)
|
||||
binary.LittleEndian.PutUint16(target[2:4], elem)
|
||||
for i := 0; i < len(data)-4; i++ {
|
||||
for i := startPos; i < len(data)-4; i++ {
|
||||
if data[i] == target[0] && data[i+1] == target[1] &&
|
||||
data[i+2] == target[2] && data[i+3] == target[3] {
|
||||
return i
|
||||
|
|
@ -305,21 +372,76 @@ func findTag(data []byte, group, elem uint16) int {
|
|||
return -1
|
||||
}
|
||||
|
||||
func findLastTag(data []byte, group, elem uint16) int {
|
||||
target := make([]byte, 4)
|
||||
binary.LittleEndian.PutUint16(target[0:2], group)
|
||||
binary.LittleEndian.PutUint16(target[2:4], elem)
|
||||
lastPos := -1
|
||||
for i := 0; i < len(data)-4; i++ {
|
||||
if data[i] == target[0] && data[i+1] == target[1] &&
|
||||
data[i+2] == target[2] && data[i+3] == target[3] {
|
||||
lastPos = i
|
||||
// findTag finds the first occurrence of a DICOM tag, using the stream walker
|
||||
// starting from the DICOM preamble offset (128-byte preamble + 4-byte DICM).
|
||||
func findTag(data []byte, group, elem uint16) int {
|
||||
// DICOM files start with 128-byte preamble + "DICM" magic.
|
||||
// Meta header (group 0x0002) always lives there; for the main dataset
|
||||
// we start the walk right after the preamble when present.
|
||||
startPos := 0
|
||||
if len(data) >= 132 && string(data[128:132]) == "DICM" {
|
||||
startPos = 132
|
||||
// For meta-header tags (group 0x0002), walk from 132.
|
||||
// For dataset tags, also walk from 132 — the walker handles both.
|
||||
}
|
||||
result := walkToTag(data, startPos, group, elem)
|
||||
if result < 0 && startPos > 0 {
|
||||
// Retry from byte 0 for edge cases (no preamble, raw DICOM)
|
||||
result = walkToTag(data, 0, group, elem)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func findLastTag(data []byte, group, elem uint16) int {
|
||||
// Walk the full stream and keep the last match position.
|
||||
startPos := 0
|
||||
if len(data) >= 132 && string(data[128:132]) == "DICM" {
|
||||
startPos = 132
|
||||
}
|
||||
lastPos := -1
|
||||
pos := startPos
|
||||
for {
|
||||
found := walkToTag(data, pos, group, elem)
|
||||
if found < 0 {
|
||||
break
|
||||
}
|
||||
lastPos = found
|
||||
pos = found + 1 // advance past this match to find a later one
|
||||
}
|
||||
return lastPos
|
||||
}
|
||||
|
||||
func readStringTag(data []byte, group, elem uint16) string {
|
||||
// isValidVR checks if the 2 bytes at offset look like a valid DICOM VR
|
||||
func isValidVR(data []byte, offset int) bool {
|
||||
if offset+2 > len(data) {
|
||||
return false
|
||||
}
|
||||
vr := string(data[offset : offset+2])
|
||||
validVRs := map[string]bool{
|
||||
"AE": true, "AS": true, "AT": true, "CS": true, "DA": true, "DS": true, "DT": true,
|
||||
"FL": true, "FD": true, "IS": true, "LO": true, "LT": true, "OB": true, "OD": true,
|
||||
"OF": true, "OW": true, "PN": true, "SH": true, "SL": true, "SQ": true, "SS": true,
|
||||
"ST": true, "TM": true, "UC": true, "UI": true, "UL": true, "UN": true, "UR": true,
|
||||
"US": true, "UT": true,
|
||||
}
|
||||
return validVRs[vr]
|
||||
}
|
||||
|
||||
// isImplicitVR returns true if transfer syntax uses implicit VR (no VR field in data elements)
|
||||
func isImplicitVR(data []byte) bool {
|
||||
// Check transfer syntax UID from file meta info (group 0x0002)
|
||||
ts := readStringTagExplicit(data, 0x0002, 0x0010)
|
||||
if ts == "" {
|
||||
// No transfer syntax specified - default to Explicit VR Little Endian
|
||||
return false
|
||||
}
|
||||
// Implicit VR Little Endian: 1.2.840.10008.1.2
|
||||
// Also check for Siemens private implicit VR variants
|
||||
return ts == "1.2.840.10008.1.2" || strings.Contains(ts, "1.2.276.0.7230010")
|
||||
}
|
||||
|
||||
// readStringTagExplicit reads with explicit VR assumption (for meta-header)
|
||||
func readStringTagExplicit(data []byte, group, elem uint16) string {
|
||||
pos := findTag(data, group, elem)
|
||||
if pos < 0 {
|
||||
return ""
|
||||
|
|
@ -327,7 +449,54 @@ func readStringTag(data []byte, group, elem uint16) string {
|
|||
vr := string(data[pos+4 : pos+6])
|
||||
var length uint16
|
||||
var valPos int
|
||||
if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" {
|
||||
if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" || vr == "OD" || vr == "UC" || vr == "UT" {
|
||||
length = uint16(binary.LittleEndian.Uint32(data[pos+8 : pos+12]))
|
||||
valPos = pos + 12
|
||||
} else {
|
||||
length = binary.LittleEndian.Uint16(data[pos+6 : pos+8])
|
||||
valPos = pos + 8
|
||||
}
|
||||
if valPos+int(length) > len(data) {
|
||||
return ""
|
||||
}
|
||||
raw := data[valPos : valPos+int(length)]
|
||||
return strings.TrimRight(string(raw), " \x00")
|
||||
}
|
||||
|
||||
func readStringTag(data []byte, group, elem uint16) string {
|
||||
pos := findTag(data, group, elem)
|
||||
if pos < 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Check for implicit VR by validating the VR field
|
||||
implicitVR := !isValidVR(data, pos+4)
|
||||
if implicitVR {
|
||||
// Implicit VR: tag (4) + length (4) + value
|
||||
length := binary.LittleEndian.Uint32(data[pos+4 : pos+8])
|
||||
valPos := pos + 8
|
||||
if valPos+int(length) > len(data) {
|
||||
return ""
|
||||
}
|
||||
raw := data[valPos : valPos+int(length)]
|
||||
var s string
|
||||
if utf8.Valid(raw) {
|
||||
s = string(raw)
|
||||
} else {
|
||||
runes := make([]rune, len(raw))
|
||||
for i, b := range raw {
|
||||
runes[i] = rune(b)
|
||||
}
|
||||
s = string(runes)
|
||||
}
|
||||
return strings.TrimRight(s, " \x00")
|
||||
}
|
||||
|
||||
// Explicit VR path
|
||||
vr := string(data[pos+4 : pos+6])
|
||||
var length uint16
|
||||
var valPos int
|
||||
if vr == "OB" || vr == "OW" || vr == "SQ" || vr == "UN" || vr == "OD" || vr == "UC" || vr == "UT" {
|
||||
length = uint16(binary.LittleEndian.Uint32(data[pos+8 : pos+12]))
|
||||
valPos = pos + 12
|
||||
} else {
|
||||
|
|
@ -366,6 +535,8 @@ func readIntTagSmart(data []byte, group, elem uint16) int {
|
|||
if pos < 0 {
|
||||
return 0
|
||||
}
|
||||
// Check for implicit VR before reading VR field
|
||||
if isValidVR(data, pos+4) {
|
||||
vr := string(data[pos+4 : pos+6])
|
||||
if vr == "US" || vr == "SS" {
|
||||
valPos := pos + 8
|
||||
|
|
@ -373,6 +544,8 @@ func readIntTagSmart(data []byte, group, elem uint16) int {
|
|||
return int(binary.LittleEndian.Uint16(data[valPos : valPos+2]))
|
||||
}
|
||||
}
|
||||
}
|
||||
// For implicit VR or non-integer VRs, fall back to string parsing
|
||||
s := strings.TrimSpace(readStringTag(data, group, elem))
|
||||
n, _ := strconv.Atoi(s)
|
||||
return n
|
||||
|
|
|
|||
Loading…
Reference in New Issue