<?php
/*
Provenance: added as scripts/broken.php by commit
c6be704344477d5a6deeda433b981d4049f2e1ee (alfsb, Mon, 20 Jan 2025 11:50:24 -0300),
"New tool for detecting broken XML files (#211)".

NOTE(review): the original license/author banner at the top of this file was
lost in extraction; restore it from the upstream file -- TODO confirm.

# Description

This command line utility tests whether a file is a valid standalone XML
file, accepting undefined entity references. If a directory is given, the
test is applied to all .xml files in that directory and its subdirectories.

This tool also honors directories marked with .xmlfragmentdir, so the files
in them are tested with relaxed semantics for XML fragments. */

ini_set( 'display_errors' , 1 );
ini_set( 'display_startup_errors' , 1 );
error_reporting( E_ALL );

if ( count( $argv ) < 2 )
    print_usage_exit( $argv[0] );

// Drop the script name; every remaining argument is a path to test.
array_shift( $argv );
foreach( $argv as $arg )
{
    if ( file_exists( $arg ) )
    {
        if ( is_file( $arg ) )
            testFile( $arg );
        if ( is_dir( $arg ) )
            testDir( $arg );
        continue;
    }
    echo "Path does not exist: $arg\n";
}

/**
 * Prints the usage message to STDERR and terminates the script.
 *
 * Exits with status 1 so that callers can detect the invocation error.
 *
 * @param string $cmd Script name as invoked (normally $argv[0]).
 */
function print_usage_exit( $cmd )
{
    fwrite( STDERR , " Wrong parameter count. Usage:\n" );
    fwrite( STDERR , " {$cmd} path:\n" );
    exit( 1 );
}

/**
 * Discovers the exact texts libxml uses for "undefined entity" errors.
 *
 * Two tiny probe documents that reference the undefined entity &ZZZ; are
 * parsed, and the resulting messages are captured, so the comparison in
 * testFile() does not depend on hard-coded message texts.
 *
 * @param string $prefix Receives the message text before the entity name.
 * @param string $suffix Receives the message text after the entity name.
 * @param string $extra  Receives the message emitted for an entity
 *                       reference placed outside the root element.
 */
function setup( string & $prefix , string & $suffix , string & $extra )
{
    // Undefined entities generate TWO different error messages on libxml
    // - "Entity '?' not defined" (for entity inside elements)
    // - "Extra content at the end of the document" (entity outside elements)

    // NOTE(review): the markup of both probes was reconstructed from the
    // comment above (inside vs. outside an element) -- verify against upstream.
    $inside  = "<x>&ZZZ;</x>";
    $outside = "<x/>&ZZZ;";

    // Fallbacks, taken from the message texts documented above. They are
    // used only when a probe yields no usable message, and guarantee that
    // $extra is never left empty (testFile() uses that as "not set up").
    $prefix = "Entity '";
    $suffix = "' not defined";
    $extra  = "Extra content at the end of the document";

    $doc = new DOMDocument();
    $doc->recover = true;
    $doc->resolveExternals = false;
    $doc->substituteEntities = false;
    libxml_use_internal_errors( true );
    libxml_clear_errors(); // do not mistake stale errors for probe results

    $doc->loadXML( $inside );
    $errors = libxml_get_errors();
    libxml_clear_errors();
    if ( count( $errors ) > 0 )
    {
        $parts = explode( "ZZZ" , trim( $errors[0]->message ) , 2 );
        // An empty prefix AND suffix would match every message, silently
        // hiding all errors, so such a result is rejected.
        if ( count( $parts ) == 2 && $parts[0] . $parts[1] != "" )
            [ $prefix , $suffix ] = $parts;
    }

    $doc->loadXML( $outside );
    $errors = libxml_get_errors();
    libxml_clear_errors();
    if ( count( $errors ) > 0 && trim( $errors[0]->message ) != "" )
        $extra = trim( $errors[0]->message );
}

/**
 * Prints one "Broken XML file" report block to standard output.
 *
 * @param string $filename    Path of the offending file.
 * @param string $message     Error text to show.
 * @param int    $lin         Line of the error (0 when unknown).
 * @param int    $col         Column of the error (0 when unknown).
 * @param bool   $hintFragDir Whether to append the .xmlfragmentdir hint.
 */
function reportBroken( string $filename , string $message , int $lin = 0 , int $col = 0 , bool $hintFragDir = false )
{
    echo "Broken XML file:\n";
    echo " Path: $filename [$lin,$col]\n";
    echo " Error: $message\n";
    if ( $hintFragDir )
        echo " Hint: Dir is marked with .xmlfragmentdir on doc-en? If not, check entity references.\n";
    echo "\n";
}

/**
 * Tests one file and reports the first error that is not an undefined
 * entity reference inside an element.
 *
 * @param string $filename Path of the file to test.
 * @param bool   $fragment When true, the contents are wrapped in a
 *                         synthetic root element before parsing, so a file
 *                         holding an XML fragment can be checked.
 */
function testFile( string $filename , bool $fragment = false )
{
    // Message texts are probed once and cached across calls.
    static $prefix = "", $suffix = "", $extra = "";
    if ( $extra == "" )
        setup( $prefix , $suffix , $extra );

    $contents = is_readable( $filename ) ? file_get_contents( $filename ) : false;
    if ( $contents === false )
    {
        reportBroken( $filename , "File could not be read" );
        return;
    }

    if ( $fragment )
    {
        // NOTE(review): wrapper element name reconstructed -- verify upstream.
        // Kept on the same line as the contents so reported line numbers
        // still match the file.
        $contents = "<f>{$contents}</f>";
    }
    elseif ( $contents == "" )
    {
        // DOMDocument::loadXML() throws a ValueError on an empty string.
        reportBroken( $filename , "Document is empty" );
        return;
    }

    $doc = new DOMDocument();
    $doc->recover = true;
    $doc->resolveExternals = false;
    $doc->substituteEntities = false;
    libxml_use_internal_errors( true );
    libxml_clear_errors();

    $doc->loadXML( $contents );

    $errors = libxml_get_errors();
    libxml_clear_errors();

    foreach( $errors as $error )
    {
        $message = trim( $error->message );

        // Undefined entity inside an element: expected, not an error here.
        if ( str_starts_with( $message , $prefix ) && str_ends_with( $message , $suffix ) )
            continue;

        // The "extra content" message is deliberately NOT skipped: it means
        // an entity reference is used at an unusual position, or that the
        // directory lacks its fragment marker, so it is reported with a hint.
        reportBroken( $filename , $message , $error->line , $error->column , $message == $extra );

        // Only the first relevant error of each file is reported.
        return;
    }
}

/**
 * Tests every .xml file in a directory, then recurses into subdirectories.
 *
 * Entries whose names start with a dot are skipped. When the directory
 * contains an entry named .xmlfragmentdir, its .xml files are tested as
 * fragments; the marker does not apply to subdirectories.
 *
 * @param string $dir Directory to scan.
 */
function testDir( string $dir )
{
    // Real paths already scanned; protects against symlink loops.
    static $visited = [];

    $real = realpath( $dir );
    if ( $real === false )
    {
        echo "Path does not exist: $dir\n";
        return;
    }
    if ( isset( $visited[$real] ) )
        return;
    $visited[$real] = true;

    $files = scandir( $real );
    if ( $files === false )
    {
        fwrite( STDERR , "Cannot read directory: $real\n" );
        return;
    }

    // Decide fragment mode BEFORE testing anything: scandir() sorts by name
    // and some file names (e.g. starting with '-' or '#') sort before the
    // marker, so checking it inside the loop would test them in strict mode.
    $fragment = in_array( ".xmlfragmentdir" , $files , true );
    $subdirs = [];

    foreach( $files as $file )
    {
        if ( $file[0] == "." )
            continue;

        $fullpath = realpath( "$real/$file" );
        if ( $fullpath === false ) // e.g. dangling symlink
            continue;

        if ( is_dir( $fullpath ) )
        {
            $subdirs[] = $fullpath;
            continue;
        }

        if ( str_ends_with( $fullpath , ".xml" ) )
            testFile( $fullpath , $fragment );
    }

    foreach( $subdirs as $subdir )
        testDir( $subdir );
}