Skip to content

Commit 670c261

Browse files
authored
fix(xmp): Correctly parse XMP with self-closing elements (#5106)
decode_xmp() only searched for `</rdf:Description>` end tags, so self-closing `<rdf:Description ... />` elements (valid XML) were silently skipped. Now detect both `/>` and `</rdf:Description>` endings, plus simplify quite a bit generally, by rewriting decode_xmp to use pugixml more thoroughly to do the whole parse job instead of trying to hunt for tags ourselves. Fixes #5021 Assisted-by: Claude Code / Opus 4.6 --------- Signed-off-by: Larry Gritz <lg@larrygritz.com>
1 parent f0aab06 commit 670c261

1 file changed

Lines changed: 26 additions & 45 deletions

File tree

src/libOpenImageIO/xmp.cpp

Lines changed: 26 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -413,25 +413,6 @@ add_attrib(ImageSpec& spec, string_view xmlname, string_view xmlvalue,
413413

414414

415415

416-
// Utility: Search str for the first substring in str (starting from
417-
// position pos) that starts with startmarker and ends with endmarker.
418-
// If not found, return false. If found, return true, store the
419-
// beginning and ending indices in startpos and endpos.
420-
static bool
421-
extract_middle(string_view str, size_t pos, string_view startmarker,
422-
string_view endmarker, size_t& startpos, size_t& endpos)
423-
{
424-
startpos = str.find(startmarker, pos);
425-
if (startpos == std::string::npos)
426-
return false; // start marker not found
427-
endpos = str.find(endmarker, startpos);
428-
if (endpos == std::string::npos)
429-
return false; // end marker not found
430-
endpos += endmarker.size();
431-
return true;
432-
}
433-
434-
435416
// Decode one XMP node and its children.
436417
// Return value is the size of the resulting attribute (can be used to
437418
// catch runaway or corrupt XML).
@@ -541,35 +522,35 @@ decode_xmp(string_view xml, ImageSpec& spec)
541522
#endif
542523
if (!xml.length())
543524
return true;
544-
for (size_t startpos = 0, endpos = 0;
545-
extract_middle(xml, endpos, "<rdf:Description", "</rdf:Description>",
546-
startpos, endpos);) {
547-
// Turn that middle section into an XML document
548-
string_view rdf = xml.substr(startpos, endpos - startpos); // scooch in
549-
#if DEBUG_XMP_READ
550-
std::cerr << "RDF is:\n---\n" << rdf.substr(0, 4096) << "\n---\n";
551-
#endif
552-
pugi::xml_document doc;
553-
pugi::xml_parse_result parse_result
554-
= doc.load_buffer(rdf.data(), rdf.size(),
555-
pugi::parse_default | pugi::parse_fragment);
556-
if (!parse_result) {
525+
// Some callers (e.g. JPEG APP1 markers) prepend a namespace URI before the
526+
// actual XML. Skip any leading non-XML content.
527+
auto xmlstart = xml.find('<');
528+
if (xmlstart == string_view::npos)
529+
return true;
530+
xml = xml.substr(xmlstart);
531+
pugi::xml_document doc;
532+
pugi::xml_parse_result parse_result
533+
= doc.load_buffer(xml.data(), xml.size(),
534+
pugi::parse_default | pugi::parse_fragment);
535+
if (!parse_result) {
557536
#if DEBUG_XMP_READ
558-
std::cerr << "Error parsing XML @" << parse_result.offset << ": "
559-
<< parse_result.description() << "\n";
537+
std::cerr << "Error parsing XML @" << parse_result.offset << ": "
538+
<< parse_result.description() << "\n";
560539
#endif
561-
// Instead of returning early here if there were errors parsing
562-
// the XML -- I have noticed that very minor XML malformations
563-
// are common in XMP found in files -- hope for the best and
564-
// go ahead and assume that maybe it managed to put something
565-
// useful in the resulting document.
566-
#if 0
567-
return true;
568-
#endif
569-
}
570-
// Decode the contents of the XML document (it will recurse)
571-
decode_xmp_node(doc.first_child(), spec);
540+
// Instead of returning early here if there were errors parsing
541+
// the XML -- I have noticed that very minor XML malformations
542+
// are common in XMP found in files -- hope for the best and
543+
// go ahead and assume that maybe it managed to put something
544+
// useful in the resulting document.
572545
}
546+
// Find the first rdf:Description node anywhere in the document.
547+
// decode_xmp_node() iterates siblings, so passing the first one processes
548+
// all rdf:Description siblings (i.e. all blocks in the rdf:RDF container).
549+
auto first_desc = doc.find_node([](pugi::xml_node n) {
550+
return strcmp(n.name(), "rdf:Description") == 0;
551+
});
552+
if (first_desc)
553+
decode_xmp_node(first_desc, spec);
573554
#if DEBUG_XMP_READ
574555
std::cerr << "XMP total parse time " << timer() << "\n";
575556
#endif

0 commit comments

Comments
 (0)