From 7495d18b2c5cfc322cec347f9f9fcc7a28e774a8 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 24 Oct 2024 14:57:05 +0900 Subject: [PATCH 01/45] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 0fbd5eb2..42623b08 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.9" + VERSION = "3.4.0" REVISION = "" Copyright = COPYRIGHT From 6a8c041d825c7d16e76b056bd63b20edb92febc6 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 24 Oct 2024 15:15:39 +0900 Subject: [PATCH 02/45] test jruby: omit fragile test --- test/parse/test_element.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index ab4818da..f07a7d5a 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -133,6 +133,7 @@ def test_linear_performance_attribute_value_gt end def test_linear_performance_deep_same_name_attributes + omit("This is fragile on JRuby") if RUBY_ENGINE == "jruby" seq = [100, 500, 1000, 1500, 2000] assert_linear_performance(seq, rehearsal: 10) do |n| xml = <<-XML From 8ef75024b96d3e5279b39fdd1692821cdbcd84b5 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 27 Oct 2024 15:02:18 +0900 Subject: [PATCH 03/45] Add `IOSource#match?` method (#216) ## Why? `StringScanner#match?` is faster than `StringScanner#check`. 
See: https://github.com/ruby/strscan/pull/111 ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 18.819 19.362 32.846 34.708 i/s - 100.000 times in 5.313905s 5.164791s 3.044500s 2.881200s sax 28.188 29.982 48.386 52.554 i/s - 100.000 times in 3.547597s 3.335304s 2.066732s 1.902809s pull 31.962 33.902 57.868 60.662 i/s - 100.000 times in 3.128689s 2.949690s 1.728071s 1.648467s stream 31.436 33.030 52.808 56.647 i/s - 100.000 times in 3.181095s 3.027574s 1.893635s 1.765304s Comparison: dom after(YJIT): 34.7 i/s before(YJIT): 32.8 i/s - 1.06x slower after: 19.4 i/s - 1.79x slower before: 18.8 i/s - 1.84x slower sax after(YJIT): 52.6 i/s before(YJIT): 48.4 i/s - 1.09x slower after: 30.0 i/s - 1.75x slower before: 28.2 i/s - 1.86x slower pull after(YJIT): 60.7 i/s before(YJIT): 57.9 i/s - 1.05x slower after: 33.9 i/s - 1.79x slower before: 32.0 i/s - 1.90x slower stream after(YJIT): 56.6 i/s before(YJIT): 52.8 i/s - 1.07x slower after: 33.0 i/s - 1.72x slower before: 31.4 i/s - 1.80x slower ``` - YJIT=ON : 1.05x - 1.09x faster - YJIT=OFF : 1.02x - 1.06x faster --------- Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 84 ++++++++++++++++----------------- lib/rexml/source.rb | 35 ++++++++++++++ 2 files changed, 77 insertions(+), 42 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index b4547ba3..ff72ce44 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -269,10 +269,10 @@ def pull_event @source.ensure_buffer if @document_status == nil start_position = @source.position - if @source.match("/um, true) if md.nil? 
raise REXML::ParseException.new("Unclosed comment", @source) @@ -281,10 +281,10 @@ def pull_event raise REXML::ParseException.new("Malformed comment", @source) end return [ :comment, md[1] ] - elsif @source.match("DOCTYPE", true) + elsif @source.match?("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" - unless @source.match(/\s+/um, true) - if @source.match(">") + unless @source.match?(/\s+/um, true) + if @source.match?(">") message = "#{base_error_message}: name is missing" else message = "#{base_error_message}: invalid name" @@ -293,10 +293,10 @@ def pull_event raise REXML::ParseException.new(message, @source) end name = parse_name(base_error_message) - if @source.match(/\s*\[/um, true) + if @source.match?(/\s*\[/um, true) id = [nil, nil, nil] @document_status = :in_doctype - elsif @source.match(/\s*>/um, true) + elsif @source.match?(/\s*>/um, true) id = [nil, nil, nil] @document_status = :after_doctype @source.ensure_buffer @@ -308,9 +308,9 @@ def pull_event # For backward compatibility id[1], id[2] = id[2], nil end - if @source.match(/\s*\[/um, true) + if @source.match?(/\s*\[/um, true) @document_status = :in_doctype - elsif @source.match(/\s*>/um, true) + elsif @source.match?(/\s*>/um, true) @document_status = :after_doctype @source.ensure_buffer else @@ -320,7 +320,7 @@ def pull_event end args = [:start_doctype, name, *id] if @document_status == :after_doctype - @source.match(/\s*/um, true) + @source.match?(/\s*/um, true) @stack << [ :end_doctype ] end return args @@ -331,14 +331,14 @@ def pull_event end end if @document_status == :in_doctype - @source.match(/\s*/um, true) # skip spaces + @source.match?(/\s*/um, true) # skip spaces start_position = @source.position - if @source.match("/um, true) raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? 
return [ :elementdecl, "") + unless @source.match?(/\s+/um, true) + if @source.match?(">") message = "#{base_error_message}: name is missing" else message = "#{base_error_message}: invalid name" @@ -405,7 +405,7 @@ def pull_event id = parse_id(base_error_message, accept_external_id: true, accept_public_id: true) - unless @source.match(/\s*>/um, true) + unless @source.match?(/\s*>/um, true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) end @@ -419,7 +419,7 @@ def pull_event end elsif match = @source.match(/(%.*?;)\s*/um, true) return [ :externalentity, match[1] ] - elsif @source.match(/\]\s*>/um, true) + elsif @source.match?(/\]\s*>/um, true) @document_status = :after_doctype return [ :end_doctype ] end @@ -428,16 +428,16 @@ def pull_event end end if @document_status == :after_doctype - @source.match(/\s*/um, true) + @source.match?(/\s*/um, true) end begin start_position = @source.position - if @source.match("<", true) + if @source.match?("<", true) # :text's read_until may remain only "<" in buffer. In the # case, buffer is empty here. So we need to fill buffer # here explicitly. 
@source.ensure_buffer - if @source.match("/", true) + if @source.match?("/", true) @namespaces_restore_stack.pop last_tag = @tags.pop md = @source.match(Private::CLOSE_PATTERN, true) @@ -452,7 +452,7 @@ def pull_event raise REXML::ParseException.new(message, @source) end return [ :end_element, last_tag ] - elsif @source.match("!", true) + elsif @source.match?("!", true) md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md @@ -470,7 +470,7 @@ def pull_event end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) - elsif @source.match("?", true) + elsif @source.match?("?", true) return process_instruction else # Get the next tag @@ -651,7 +651,7 @@ def need_source_encoding_update?(xml_declaration_encoding) def parse_name(base_error_message) md = @source.match(Private::NAME_PATTERN, true) unless md - if @source.match(/\S/um) + if @source.match?(/\S/um) message = "#{base_error_message}: invalid name" else message = "#{base_error_message}: name is missing" @@ -693,34 +693,34 @@ def parse_id_invalid_details(accept_external_id:, accept_public_id:) public = /\A\s*PUBLIC/um system = /\A\s*SYSTEM/um - if (accept_external_id or accept_public_id) and @source.match(/#{public}/um) - if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um) + if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um) + if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um) return "public ID literal is missing" end - unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um) + unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um) return "invalid public ID literal" end if accept_public_id - if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um) + if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um) return "system ID literal is missing" end - unless 
@source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um) + unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um) return "invalid system literal" end "garbage after system literal" else "garbage after public ID literal" end - elsif accept_external_id and @source.match(/#{system}/um) - if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um) + elsif accept_external_id and @source.match?(/#{system}/um) + if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um) return "system literal is missing" end - unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um) + unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um) return "invalid system literal" end "garbage after system literal" else - unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um) + unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um) return "invalid ID type" end "ID type is missing" @@ -729,7 +729,7 @@ def parse_id_invalid_details(accept_external_id:, def process_instruction name = parse_name("Malformed XML: Invalid processing instruction node") - if @source.match(/\s+/um, true) + if @source.match?(/\s+/um, true) match_data = @source.match(/(.*?)\?>/um, true) unless match_data raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) @@ -737,7 +737,7 @@ def process_instruction content = match_data[1] else content = nil - unless @source.match("?>", true) + unless @source.match?("?>", true) raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) end end @@ -767,9 +767,9 @@ def parse_attributes(prefixes) expanded_names = {} closed = false while true - if @source.match(">", true) + if @source.match?(">", true) return attributes, closed - elsif @source.match("/>", true) + elsif @source.match?("/>", true) closed = true return attributes, closed elsif match = @source.match(QNAME, true) @@ -777,7 +777,7 @@ def parse_attributes(prefixes) prefix = match[2] local_part = match[3] - unless @source.match(/\s*=\s*/um, true) + unless 
@source.match?(/\s*=\s*/um, true) message = "Missing attribute equal: <#{name}>" raise REXML::ParseException.new(message, @source) end @@ -793,7 +793,7 @@ def parse_attributes(prefixes) message = "Missing attribute value end quote: <#{name}>: <#{quote}>" raise REXML::ParseException.new(message, @source) end - @source.match(/\s*/um, true) + @source.match?(/\s*/um, true) if prefix == "xmlns" if local_part == "xml" if value != Private::XML_PREFIXED_NAMESPACE diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index dc0b5323..27a6349a 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -18,6 +18,16 @@ def scan(pattern) pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) super(pattern) end + + def match?(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end + + def skip(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end end end using StringScannerCheckScanString @@ -126,6 +136,14 @@ def match(pattern, cons=false) end end + def match?(pattern, cons=false) + if cons + !@scanner.skip(pattern).nil? + else + !@scanner.match?(pattern).nil? + end + end + def position @scanner.pos end @@ -267,6 +285,23 @@ def match( pattern, cons=false ) md.nil? ? nil : @scanner end + def match?( pattern, cons=false ) + # To avoid performance issue, we need to increase bytes to read per scan + min_bytes = 1 + while true + if cons + n_matched_bytes = @scanner.skip(pattern) + else + n_matched_bytes = @scanner.match?(pattern) + end + return true if n_matched_bytes + return false if pattern.is_a?(String) + return false if @source.nil? + return false unless read(nil, min_bytes) + min_bytes *= 2 + end + end + def empty? super and ( @source.nil? || @source.eof? 
) end From 519ae6c0f3c0b08101e7251fddcc941308d9be87 Mon Sep 17 00:00:00 2001 From: Kevin Ebaugh Date: Tue, 29 Oct 2024 20:20:16 -0400 Subject: [PATCH 04/45] Clarify variable name (#218) Co-authored-by: Olle Jonsson --- lib/rexml/source.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 27a6349a..9a9b8f5d 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -87,7 +87,7 @@ def initialize(arg, encoding=nil) detect_encoding end @line = 0 - @term_encord = {} + @encoded_terms = {} end # The current buffer (what we're going to read next) @@ -246,7 +246,7 @@ def read(term = nil, min_bytes = 1) def read_until(term) pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ - term = @term_encord[term] ||= encode(term) + term = @encoded_terms[term] ||= encode(term) until str = @scanner.scan_until(pattern) break if @source.nil? break if @source.eof? From ed9168e187c6c8a0cb244f863c841d8f8fc8effd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Duarte?= Date: Wed, 6 Nov 2024 00:06:51 +0000 Subject: [PATCH 05/45] Stop requiring stringio dynamically (#219) `SourceFactory::create_from(String)` will always run the `require 'stringio'` operation. This prevents a multi-threaded JRuby application from parsing xml on separate threads concurrently given that `require` will pass through a synchronized piece of code. An experiment in removing this `require` lead to a 10x performance improvement on 10 threads parsing incoming strings on xml. 
For more details see https://github.com/logstash-plugins/logstash-filter-xml/issues/83 --- lib/rexml/source.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 9a9b8f5d..655164f3 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -1,6 +1,7 @@ # coding: US-ASCII # frozen_string_literal: false +require "stringio" require "strscan" require_relative 'encoding' @@ -45,7 +46,6 @@ def SourceFactory::create_from(arg) arg.respond_to? :eof? IOSource.new(arg) elsif arg.respond_to? :to_str - require 'stringio' IOSource.new(StringIO.new(arg)) elsif arg.kind_of? Source arg From 20562ec7bb15226b1820d53e16770a446089026e Mon Sep 17 00:00:00 2001 From: Dmitry Pogrebnoy <39134692+DmitryPogrebnoy@users.noreply.github.com> Date: Fri, 8 Nov 2024 06:11:34 +0100 Subject: [PATCH 06/45] parser pull: Add support for reusing parser (#220) GitHub: Fix GH-214 This is for parsing XML documents stream. We can use one parser to parse multiple XML documents with this feature. Co-authored-by: Dmitry Pogrebnoy --- lib/rexml/parsers/baseparser.rb | 4 ++++ lib/rexml/parsers/pullparser.rb | 4 ++++ test/test_pullparser.rb | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index ff72ce44..90851bb1 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -181,6 +181,10 @@ def add_listener( listener ) def stream=( source ) @source = SourceFactory.create_from( source ) + reset + end + + def reset @closed = nil @have_root = false @document_status = nil diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb index a331eff5..e0b1e94d 100644 --- a/lib/rexml/parsers/pullparser.rb +++ b/lib/rexml/parsers/pullparser.rb @@ -93,6 +93,10 @@ def pull def unshift token @my_stack.unshift token end + + def reset + @parser.reset + end end # A parsing event. 
The contents of the event are accessed as an +Array?, diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index bdf8be17..4471df4b 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -156,6 +156,39 @@ def test_peek assert_equal( 0, names.length ) end + def test_reset + xml_chunks = [ + "First valid and complete message", + "Second valid and complete message", + "Third valid and complete message" + ] + + messages = [] + + IO.pipe do |reader, writer| + xml_chunks.each do |chunk| + writer.write(chunk) + end + writer.close + + parser = REXML::Parsers::PullParser.new(reader) + while parser.has_next? + parser.pull + message_text = parser.pull + messages << message_text[0] + parser.pull + parser.reset + end + end + + assert_equal( + messages, + ["First valid and complete message", + "Second valid and complete message", + "Third valid and complete message"] + ) + end + class EntityExpansionLimitTest < Test::Unit::TestCase class GeneralEntityTest < self def test_have_value From 963ccdf830d64044676830b87ef2b4189fbefe2b Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 19 Nov 2024 09:35:04 +0900 Subject: [PATCH 07/45] Fix error handling when parsing XML via IO.pipe (#221) ## Why? If via IO.pipe, `IOError` exception is not raised, but `Errno::ESPIPE` or `Errno::EPIPE` or `Errno::EINVAL` exception is raised. 
- CRuby ``` @er_source.pos #=> Errno::ESPIPE Exception: Illegal seek ``` - CRuby (Windows (Ruby 2.6 or earlier)) ``` @er_source.pos #=> Errno::EINVAL: Invalid argument ``` - JRuby ``` @er_source.pos #=> Errno::EPIPE: Broken pipe - No message available ``` Co-authored-by: Sutou Kouhei --- lib/rexml/source.rb | 2 +- test/parse/test_text.rb | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 655164f3..b0b89b71 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -321,7 +321,7 @@ def current_line rescue end @er_source.seek(pos) - rescue IOError + rescue IOError, SystemCallError pos = -1 line = -1 end diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb index bb208d47..eb6de0cb 100644 --- a/test/parse/test_text.rb +++ b/test/parse/test_text.rb @@ -6,10 +6,7 @@ class TestParseText < Test::Unit::TestCase class TestInvalid < self def test_text_only exception = assert_raise(REXML::ParseException) do - parser = REXML::Parsers::BaseParser.new('a') - while parser.has_next? 
- parser.pull - end + REXML::Parsers::BaseParser.new('a').pull end assert_equal(<<~DETAIL.chomp, exception.to_s) @@ -21,6 +18,25 @@ def test_text_only DETAIL end + def test_text_only_with_io_pipe + IO.pipe do |reader, writer| + writer.write('a') + writer.close + + exception = assert_raise(REXML::ParseException) do + REXML::Parsers::BaseParser.new(reader).pull + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Content at the start of the document (got 'a') + Line: -1 + Position: -1 + Last 80 unconsumed characters: + + DETAIL + end + end + def test_before_root exception = assert_raise(REXML::ParseException) do parser = REXML::Parsers::BaseParser.new('b') From 46dd810e001763e42ee08346e71c46dbc8cfa7fd Mon Sep 17 00:00:00 2001 From: Jun Aruga Date: Thu, 28 Nov 2024 02:24:27 +0100 Subject: [PATCH 08/45] test: Fix NameError: uninitialized constant REXML::Parsers::PullParser (#222) This commit fixes the following NameError when running only specific test file test/parse/test_character_reference.rb. ``` $ ruby test/parse/test_character_reference.rb -v Loaded suite test/parse/test_character_reference Started REXMLTests::TestParseCharacterReference: test_hex_precedding_zero: E =========================================================================================== Error: test_hex_precedding_zero(REXMLTests::TestParseCharacterReference): NameError: uninitialized constant REXML::Parsers::PullParser test/parse/test_character_reference.rb:18:in 'REXMLTests::TestParseCharacterReference#test_hex_precedding_zero' =========================================================================================== : (0.001470) test_linear_performance_many_preceding_zeros: .: (0.033970) Finished in 0.036068018 seconds. 
------------------------------------------------------------------------------------------- 2 tests, 15 assertions, 0 failures, 1 errors, 0 pendings, 0 omissions, 0 notifications 50% passed ------------------------------------------------------------------------------------------- 55.45 tests/s, 415.88 assertions/s ``` --- test/parse/test_character_reference.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb index 4bb5da5c..bf2b1938 100644 --- a/test/parse/test_character_reference.rb +++ b/test/parse/test_character_reference.rb @@ -2,6 +2,7 @@ require "core_assertions" require "rexml/document" +require "rexml/parsers/pullparser" module REXMLTests class TestParseCharacterReference < Test::Unit::TestCase From 91305c191152107bd9ea8d026f3777e5f00c80b5 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 9 Dec 2024 09:58:05 +0900 Subject: [PATCH 09/45] Remove old code for Ruby 1.8 (#223) ## Why? `String#encode` is supported in Ruby 1.9 and later. --- lib/rexml/text.rb | 55 +++++++++++++---------------------------------- 1 file changed, 15 insertions(+), 40 deletions(-) diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 997f77d3..2bf480fb 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -29,31 +29,16 @@ class Text < Child (0x10000..0x10FFFF) ] - if String.method_defined? 
:encode - VALID_XML_CHARS = Regexp.new('^['+ - VALID_CHAR.map { |item| - case item - when Integer - [item].pack('U').force_encoding('utf-8') - when Range - [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8') - end - }.join + - ']*$') - else - VALID_XML_CHARS = /^( - [\x09\x0A\x0D\x20-\x7E] # ASCII - | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte - | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs - | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte - | \xEF[\x80-\xBE]{2} # - | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff - | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates - | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 - | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 - | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 - )*$/nx; - end + VALID_XML_CHARS = Regexp.new('^['+ + VALID_CHAR.map { |item| + case item + when Integer + [item].pack('U').force_encoding('utf-8') + when Range + [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8') + end + }.join + + ']*$') # Constructor # +arg+ if a String, the content is set to the String. If a Text, @@ -132,21 +117,11 @@ def Text.check string, pattern, doctype # illegal anywhere if !string.match?(VALID_XML_CHARS) - if String.method_defined? 
:encode - string.chars.each do |c| - case c.ord - when *VALID_CHAR - else - raise "Illegal character #{c.inspect} in raw string #{string.inspect}" - end - end - else - string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c| - case c.unpack('U') - when *VALID_CHAR - else - raise "Illegal character #{c.inspect} in raw string #{string.inspect}" - end + string.chars.each do |c| + case c.ord + when *VALID_CHAR + else + raise "Illegal character #{c.inspect} in raw string #{string.inspect}" end end end From dfc775343b3b4aec4c046b9df17bbe571612a861 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 15 Dec 2024 11:14:48 +0900 Subject: [PATCH 10/45] release: use Trusted Publishing --- .github/workflows/release.yml | 18 ++++++++++++++++++ Rakefile | 9 +++++++++ 2 files changed, 27 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 20ff87e7..76269f44 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -28,3 +28,21 @@ jobs: --title "${title}" env: GH_TOKEN: ${{ github.token }} + + rubygems: + name: RubyGems + runs-on: ubuntu-latest + timeout-minutes: 10 + permissions: + id-token: write + environment: release + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: ruby + bundler-cache: true + - uses: rubygems/configure-rubygems-credentials@v1.0.0 + - name: Push gems + run: | + bundle exec rake release:rubygem_push diff --git a/Rakefile b/Rakefile index 4676930b..8f42da1f 100644 --- a/Rakefile +++ b/Rakefile @@ -67,3 +67,12 @@ end desc "Run all benchmarks" task :benchmark => benchmark_tasks + +release_task = Rake.application["release"] +release_task.prerequisites.delete("build") +release_task.prerequisites.delete("release:rubygem_push") +release_task_comment = release_task.comment +if release_task_comment + release_task.clear_comments + release_task.comment = release_task_comment.gsub(/ and build.*$/, "") +end From 
19d8ebfbcfb3816fdad050baa74da40d6f04ffac Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 15 Dec 2024 11:18:54 +0900 Subject: [PATCH 11/45] Add 3.4.0 entry --- NEWS.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/NEWS.md b/NEWS.md index 3d17c287..f25a33f2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,34 @@ # News +## 3.4.0 - 2024-12-15 {#version-3-4-0} + +### Improvement + + * Improved performance. + * GH-216 + * Patch by NAITOH Jun + + * JRuby: Improved parse performance. + * GH-219 + * Patch by João Duarte + + * Added support for reusing pull parser. + * GH-214 + * GH-220 + * Patch by Dmitry Pogrebnoy + + * Improved error handling when source is `IO`. + * GH-221 + * Patch by NAITOH Jun + +### Thanks + + * NAITOH Jun + + * João Duarte + + * Dmitry Pogrebnoy + ## 3.3.9 - 2024-10-24 {#version-3-3-9} ### Improvements From a1d875b23340df6b33d3bbe6b17cca807eb0e3d2 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 15 Dec 2024 11:19:55 +0900 Subject: [PATCH 12/45] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 42623b08..a653f028 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.4.0" + VERSION = "3.4.1" REVISION = "" Copyright = COPYRIGHT From bb0bedd25dbb69b247b0894a6c357f8903a2b9a2 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 19 Dec 2024 11:18:52 +0900 Subject: [PATCH 13/45] Optimize `IOSource#read_until` method by using `StringScanner#check_until(string)` (#226) ## Why? `StringScanner#check_until(string)` is faster than `StringScanner#check_until(regex)`. 
See: - https://github.com/ruby/strscan/pull/106 - https://github.com/ruby/strscan/pull/111 ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 19.459 19.840 35.035 35.786 i/s - 100.000 times in 5.139034s 5.040369s 2.854304s 2.794367s sax 30.057 30.026 52.986 53.716 i/s - 100.000 times in 3.326998s 3.330499s 1.887303s 1.861652s pull 33.777 34.415 62.294 64.020 i/s - 100.000 times in 2.960622s 2.905668s 1.605284s 1.562002s stream 33.789 34.003 60.174 60.411 i/s - 100.000 times in 2.959521s 2.940916s 1.661845s 1.655334s Comparison: dom after(YJIT): 35.8 i/s before(YJIT): 35.0 i/s - 1.02x slower after: 19.8 i/s - 1.80x slower before: 19.5 i/s - 1.84x slower sax after(YJIT): 53.7 i/s before(YJIT): 53.0 i/s - 1.01x slower before: 30.1 i/s - 1.79x slower after: 30.0 i/s - 1.79x slower pull after(YJIT): 64.0 i/s before(YJIT): 62.3 i/s - 1.03x slower after: 34.4 i/s - 1.86x slower before: 33.8 i/s - 1.90x slower stream after(YJIT): 60.4 i/s before(YJIT): 60.2 i/s - 1.00x slower after: 34.0 i/s - 1.78x slower before: 33.8 i/s - 1.79x slower ``` - YJIT=ON : 1.00x - 1.03x faster - YJIT=OFF : 1.00x - 1.02x faster --- lib/rexml/source.rb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index b0b89b71..2409f76e 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -68,8 +68,14 @@ module Private SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} pre_defined_terms = ["'", '"', "<"] - pre_defined_terms.each do |term| - PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ + if StringScanner::Version < "3.1.1" + pre_defined_terms.each do |term| + PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ + end + else + 
pre_defined_terms.each do |term| + PRE_DEFINED_TERM_PATTERNS[term] = term + end end end private_constant :Private From b70388c2638d90ebd2ae471bd85239d8469b8e62 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 21 Dec 2024 07:59:47 +0900 Subject: [PATCH 14/45] Use `StringScanner#peek_byte` to get double or single quotation mark (#227) ## Why? `StringScanner#peek_byte` is fast, because it does not generate String object. ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 19.753 19.888 35.641 35.928 i/s - 100.000 times in 5.062402s 5.028121s 2.805792s 2.783339s sax 30.349 30.978 53.485 57.885 i/s - 100.000 times in 3.295012s 3.228103s 1.869671s 1.727567s pull 34.170 35.436 61.713 66.534 i/s - 100.000 times in 2.926534s 2.821955s 1.620404s 1.502996s stream 33.121 35.268 60.751 63.276 i/s - 100.000 times in 3.019222s 2.835443s 1.646065s 1.580374s Comparison: dom after(YJIT): 35.9 i/s before(YJIT): 35.6 i/s - 1.01x slower after: 19.9 i/s - 1.81x slower before: 19.8 i/s - 1.82x slower sax after(YJIT): 57.9 i/s before(YJIT): 53.5 i/s - 1.08x slower after: 31.0 i/s - 1.87x slower before: 30.3 i/s - 1.91x slower pull after(YJIT): 66.5 i/s before(YJIT): 61.7 i/s - 1.08x slower after: 35.4 i/s - 1.88x slower before: 34.2 i/s - 1.95x slower stream after(YJIT): 63.3 i/s before(YJIT): 60.8 i/s - 1.04x slower after: 35.3 i/s - 1.79x slower before: 33.1 i/s - 1.91x slower ``` - YJIT=ON : 1.01x - 1.08x faster - YJIT=OFF : 1.00x - 1.06x faster Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 22 ++++++++++++++++++++-- lib/rexml/source.rb | 8 ++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 
90851bb1..13cdd821 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -766,6 +766,25 @@ def process_instruction [:processing_instruction, name, content] end + if StringScanner::Version < "3.1.1" + def scan_quote + @source.match(/(['"])/, true)&.[](1) + end + else + def scan_quote + case @source.peek_byte + when 34 # '"'.ord + @source.scan_byte + '"' + when 39 # "'".ord + @source.scan_byte + "'" + else + nil + end + end + end + def parse_attributes(prefixes) attributes = {} expanded_names = {} @@ -785,11 +804,10 @@ def parse_attributes(prefixes) message = "Missing attribute equal: <#{name}>" raise REXML::ParseException.new(message, @source) end - unless match = @source.match(/(['"])/, true) + unless quote = scan_quote message = "Missing attribute value start quote: <#{name}>" raise REXML::ParseException.new(message, @source) end - quote = match[1] start_position = @source.position value = @source.read_until(quote) unless value.chomp!(quote) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 2409f76e..5ba5ab12 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -158,6 +158,14 @@ def position=(pos) @scanner.pos = pos end + def peek_byte + @scanner.peek_byte + end + + def scan_byte + @scanner.scan_byte + end + # @return true if the Source is exhausted def empty? @scanner.eos? From a4bf93a65e03c6bf26c688a8a616ad135f89244f Mon Sep 17 00:00:00 2001 From: OlofKalufs Date: Mon, 20 Jan 2025 15:38:08 +0100 Subject: [PATCH 15/45] Added rdoc as a development dependency (for Ruby 3.5+) (#235) Ruby 3.5+ requires that rdoc explicitly be declared as a dependency Should sort out GitHub Actions that are failing due to this --- Gemfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Gemfile b/Gemfile index 1710ec99..d323e2c5 100644 --- a/Gemfile +++ b/Gemfile @@ -16,6 +16,7 @@ group :development do # depends on "ostruct" explicitly. 
gem "ostruct" gem "rake" + gem "rdoc" end group :benchmark do From 107e273337b2e2160d6b0b15e10d0a9da0b9e164 Mon Sep 17 00:00:00 2001 From: OlofKalufs Date: Mon, 20 Jan 2025 23:13:00 +0100 Subject: [PATCH 16/45] Fix serialization of ATTLIST is incorrect (#234) GitHub: fix #233 Changed so that " EOL + assert_equal '', doc.doctype.children[0].to_s.gsub(/\s+/, " ") assert_equal 'gobble', doc.root.attributes['bar'] assert_equal 'xxx', doc.root.elements[2].namespace assert_equal 'two', doc.root.elements[1].namespace From f63c510287d29c2d6261ad94a641cb93f731be4a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Fri, 24 Jan 2025 09:55:57 +0900 Subject: [PATCH 17/45] Changed benchmark target to Ruby latest (#236) Ruby 3.4 has been released, we will change our benchmark target to Ruby latest(3.4). Co-authored-by: Sutou Kouhei --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 52349b44..2c638b03 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -11,7 +11,7 @@ jobs: fail-fast: false matrix: ruby-version: - - '3.3' + - 'ruby' runs-on: - ubuntu-latest runs-on: ${{ matrix.runs-on }} From 67d21be36c87d23b7a00c4f50017d9db977319d2 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 26 Jan 2025 19:56:59 +0900 Subject: [PATCH 18/45] Reduced regular expression processing in the form of processing white space first (#237) ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.4.1/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +PRISM [arm64-darwin24] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 19.849 20.109 36.064 38.655 i/s - 100.000 times in 5.038102s 4.972864s 2.772838s 2.586981s sax 30.339 30.449 52.946 54.873 i/s - 100.000 times in 3.296102s 3.284176s 1.888722s 
1.822391s pull 34.785 34.916 65.808 65.219 i/s - 100.000 times in 2.874810s 2.863976s 1.519581s 1.533305s stream 34.766 34.921 61.920 63.277 i/s - 100.000 times in 2.876359s 2.863571s 1.615000s 1.580354s Comparison: dom after(YJIT): 38.7 i/s before(YJIT): 36.1 i/s - 1.07x slower after: 20.1 i/s - 1.92x slower before: 19.8 i/s - 1.95x slower sax after(YJIT): 54.9 i/s before(YJIT): 52.9 i/s - 1.04x slower after: 30.4 i/s - 1.80x slower before: 30.3 i/s - 1.81x slower pull before(YJIT): 65.8 i/s after(YJIT): 65.2 i/s - 1.01x slower after: 34.9 i/s - 1.88x slower before: 34.8 i/s - 1.89x slower stream after(YJIT): 63.3 i/s before(YJIT): 61.9 i/s - 1.02x slower after: 34.9 i/s - 1.81x slower before: 34.8 i/s - 1.82x slower ``` - YJIT=ON : 0.99x - 1.07x faster - YJIT=OFF : 1.00x - 1.01x faster --- lib/rexml/parsers/baseparser.rb | 13 ++++++++----- test/parse/test_document_type_declaration.rb | 10 +++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 87f50f09..44aacfa2 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -297,10 +297,11 @@ def pull_event raise REXML::ParseException.new(message, @source) end name = parse_name(base_error_message) - if @source.match?(/\s*\[/um, true) + @source.match?(/\s*/um, true) # skip spaces + if @source.match?("[", true) id = [nil, nil, nil] @document_status = :in_doctype - elsif @source.match?(/\s*>/um, true) + elsif @source.match?(">", true) id = [nil, nil, nil] @document_status = :after_doctype @source.ensure_buffer @@ -312,9 +313,10 @@ def pull_event # For backward compatibility id[1], id[2] = id[2], nil end - if @source.match?(/\s*\[/um, true) + @source.match?(/\s*/um, true) # skip spaces + if @source.match?("[", true) @document_status = :in_doctype - elsif @source.match?(/\s*>/um, true) + elsif @source.match?(">", true) @document_status = :after_doctype @source.ensure_buffer else @@ -409,7 +411,8 @@ def 
pull_event id = parse_id(base_error_message, accept_external_id: true, accept_public_id: true) - unless @source.match?(/\s*>/um, true) + @source.match?(/\s*/um, true) # skip spaces + unless @source.match?(">", true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 99c23745..b22863a9 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -153,7 +153,7 @@ def test_no_literal Line: 3 Position: 26 Last 80 unconsumed characters: - SYSTEM> +SYSTEM> DETAIL end @@ -200,7 +200,7 @@ def test_content_double_quote Line: 3 Position: 62 Last 80 unconsumed characters: - PUBLIC 'double quote " is invalid' "r.dtd"> +PUBLIC 'double quote " is invalid' "r.dtd"> DETAIL end @@ -228,10 +228,10 @@ def test_garbage_after_literal end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed DOCTYPE: garbage after external ID -Line: 3 -Position: 65 +Line: 1 +Position: 58 Last 80 unconsumed characters: -x'> +x'> DETAIL end From bfb37e9ca4cb974c9bb2dc2f06e1202719d1bc4d Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 16 Feb 2025 10:57:37 +0900 Subject: [PATCH 19/45] Add 3.4.1 entry (#239) --- NEWS.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/NEWS.md b/NEWS.md index f25a33f2..51a45cab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,29 @@ # News +## 3.4.1 - 2025-02-16 {#version-3-4-1} + +### Improvement + + * Improved performance. 
+ * GH-226 + * GH-227 + * GH-237 + * Patch by NAITOH Jun + +### Fixes + + * Fix serialization of ATTLIST is incorrect + * GH-233 + * GH-234 + * Patch by OlofKalufs + * Reported by OlofKalufs + +### Thanks + + * NAITOH Jun + + * OlofKalufs + ## 3.4.0 - 2024-12-15 {#version-3-4-0} ### Improvement From b97e454ceb2e1719a487bfebaae3da4a706a854b Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 16 Feb 2025 16:48:06 +0900 Subject: [PATCH 20/45] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index a653f028..bf3c0d32 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.4.1" + VERSION = "3.4.2" REVISION = "" Copyright = COPYRIGHT From 64a709e74551d5968f2241a772876f4b0c8dea22 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 2 Mar 2025 11:38:54 +0900 Subject: [PATCH 21/45] Improve CDATA parse performance (#244) ## Why? 
GitHub: fix #243 ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/parse_cdata.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) dom 648.361 1.178k 591.590 1.046k i/s - 100.000 times in 0.154235s 0.084913s 0.169036s 0.095627s sax 699.061 1.378k 651.148 1.196k i/s - 100.000 times in 0.143049s 0.072549s 0.153575s 0.083611s pull 699.271 1.379k 660.275 1.210k i/s - 100.000 times in 0.143006s 0.072527s 0.151452s 0.082622s stream 701.725 1.383k 659.483 1.228k i/s - 100.000 times in 0.142506s 0.072307s 0.151634s 0.081455s Comparison: dom master: 1177.7 i/s master(YJIT): 1045.7 i/s - 1.13x slower rexml 3.4.1: 648.4 i/s - 1.82x slower 3.4.1(YJIT): 591.6 i/s - 1.99x slower sax master: 1378.4 i/s master(YJIT): 1196.0 i/s - 1.15x slower rexml 3.4.1: 699.1 i/s - 1.97x slower 3.4.1(YJIT): 651.1 i/s - 2.12x slower pull master: 1378.8 i/s master(YJIT): 1210.3 i/s - 1.14x slower rexml 3.4.1: 699.3 i/s - 1.97x slower 3.4.1(YJIT): 660.3 i/s - 2.09x slower stream master: 1383.0 i/s master(YJIT): 1227.7 i/s - 1.13x slower rexml 3.4.1: 701.7 i/s - 1.97x slower 3.4.1(YJIT): 659.5 i/s - 2.10x slower ``` - YJIT=ON : 1.76x - 1.83x faster - YJIT=OFF : 1.82x - 1.97x faster Reported by Masamune. Thanks!!! 
Co-authored-by: Sutou Kouhei --- benchmark/parse_cdata.yaml | 50 +++++++++++++++++++++++++++++++++ lib/rexml/parsers/baseparser.rb | 10 +++++-- lib/rexml/source.rb | 2 +- test/parse/test_cdata.rb | 20 ++++++++++++- 4 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 benchmark/parse_cdata.yaml diff --git a/benchmark/parse_cdata.yaml b/benchmark/parse_cdata.yaml new file mode 100644 index 00000000..cde04306 --- /dev/null +++ b/benchmark/parse_cdata.yaml @@ -0,0 +1,50 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + require 'rexml/parsers/sax2parser' + require 'rexml/parsers/pullparser' + require 'rexml/parsers/streamparser' + require 'rexml/streamlistener' + + def build_xml(size) + xml = "\n" + + "Test\n" + + "\n" + end + xml = build_xml(100000) + + class Listener + include REXML::StreamListener + end + +benchmark: + 'dom' : REXML::Document.new(xml) + 'sax' : REXML::Parsers::SAX2Parser.new(xml).parse + 'pull' : | + parser = REXML::Parsers::PullParser.new(xml) + while parser.has_next? 
+ parser.pull + end + 'stream' : REXML::Parsers::StreamParser.new(xml, Listener.new).parse diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 44aacfa2..e666c2af 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -471,9 +471,13 @@ def pull_event end return [ :comment, md[1] ] - else - md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) - return [ :cdata, md[1] ] if md + elsif @source.match?("[CDATA[", true) + text = @source.read_until("]]>") + if text.chomp!("]]>") + return [ :cdata, text ] + else + raise REXML::ParseException.new("Malformed CDATA: Missing end ']]>'", @source) + end end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 5ba5ab12..3ec1141e 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -67,7 +67,7 @@ class Source module Private SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} - pre_defined_terms = ["'", '"', "<"] + pre_defined_terms = ["'", '"', "<", "]]>"] if StringScanner::Version < "3.1.1" pre_defined_terms.each do |term| PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb index b5f1a3bc..c742d6a1 100644 --- a/test/parse/test_cdata.rb +++ b/test/parse/test_cdata.rb @@ -7,10 +7,28 @@ module REXMLTests class TestParseCData < Test::Unit::TestCase include Test::Unit::CoreAssertions + def parse(xml) + REXML::Document.new(xml) + end + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('" * n + ' ]]>') + parse('" * n + ' ]]>') + end + end + + class TestInvalid < self + def test_unclosed_cdata + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed CDATA: Missing end ']]>' + Line: 1 + Position: 25 + Last 80 
unconsumed characters: + DETAIL end end end From 434909171ef3756c1ca2b84f5c90923a72c6a591 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 3 Mar 2025 13:47:31 +0900 Subject: [PATCH 22/45] Improve comment parse performance (#245) ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/parse_comment.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) top_level 999.440 5.058k 922.416 3.340k i/s - 100.000 times in 0.100056s 0.019770s 0.108411s 0.029936s in_doctype 1.063k 4.890k 980.498 3.341k i/s - 100.000 times in 0.094116s 0.020449s 0.101989s 0.029927s after_doctype 638.321 1.304k 603.952 1.153k i/s - 100.000 times in 0.156661s 0.076710s 0.165576s 0.086748s Comparison: top_level master: 5058.2 i/s master(YJIT): 3340.5 i/s - 1.51x slower rexml 3.4.1: 999.4 i/s - 5.06x slower 3.4.1(YJIT): 922.4 i/s - 5.48x slower in_doctype master: 4890.2 i/s master(YJIT): 3341.5 i/s - 1.46x slower rexml 3.4.1: 1062.5 i/s - 4.60x slower 3.4.1(YJIT): 980.5 i/s - 4.99x slower after_doctype master: 1303.6 i/s master(YJIT): 1152.8 i/s - 1.13x slower rexml 3.4.1: 638.3 i/s - 2.04x slower 3.4.1(YJIT): 604.0 i/s - 2.16x slower ``` - YJIT=ON : 1.90x - 3.62x faster - YJIT=OFF : 2.04x - 5.06x faster --- benchmark/parse_comment.yaml | 36 ++++++++++++++++++++++++++++++ lib/rexml/parsers/baseparser.rb | 39 ++++++++++++++------------------- test/parse/test_comment.rb | 21 +++++++++++++----- 3 files changed, 69 insertions(+), 27 deletions(-) create mode 100644 benchmark/parse_comment.yaml diff --git a/benchmark/parse_comment.yaml b/benchmark/parse_comment.yaml new file mode 100644 index 00000000..a0a3a771 --- /dev/null +++ b/benchmark/parse_comment.yaml @@ -0,0 +1,36 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | 
+ require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + SIZE = 100000 + + top_level_xml = "\n" + in_doctype_xml = "]>" + after_doctype_xml = "" + +benchmark: + 'top_level' : REXML::Document.new(top_level_xml) + 'in_doctype' : REXML::Document.new(in_doctype_xml) + 'after_doctype' : REXML::Document.new(after_doctype_xml) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index e666c2af..61d38ae2 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -277,14 +277,7 @@ def pull_event return process_instruction elsif @source.match?("/um, true) - if md.nil? - raise REXML::ParseException.new("Unclosed comment", @source) - end - if /--|-\z/.match?(md[1]) - raise REXML::ParseException.new("Malformed comment", @source) - end - return [ :comment, md[1] ] + return [ :comment, process_comment ] elsif @source.match?("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" unless @source.match?(/\s+/um, true) @@ -417,12 +410,8 @@ def pull_event raise REXML::ParseException.new(message, @source) end return [:notationdecl, name, *id] - elsif md = @source.match(/--(.*?)-->/um, true) - case md[1] - when /--/, /-\z/ - raise REXML::ParseException.new("Malformed comment", @source) - end - return [ :comment, md[1] ] if md + elsif @source.match?("--", true) + return [ :comment, process_comment ] end elsif match = @source.match(/(%.*?;)\s*/um, true) return [ :externalentity, match[1] ] @@ -463,14 +452,8 @@ def pull_event md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][0] == ?- - md = @source.match(/--(.*?)-->/um, true) - - if md.nil? 
|| /--|-\z/.match?(md[1]) - raise REXML::ParseException.new("Malformed comment", @source) - end - - return [ :comment, md[1] ] + if @source.match?("--", true) + return [ :comment, process_comment ] elsif @source.match?("[CDATA[", true) text = @source.read_until("]]>") if text.chomp!("]]>") @@ -738,6 +721,18 @@ def parse_id_invalid_details(accept_external_id:, end end + def process_comment + text = @source.read_until("-->") + unless text.chomp!("-->") + raise REXML::ParseException.new("Unclosed comment: Missing end '-->'", @source) + end + + if text.include? "--" or text.end_with?("-") + raise REXML::ParseException.new("Malformed comment", @source) + end + text + end + def process_instruction name = parse_name("Malformed XML: Invalid processing instruction node") if @source.match?(/\s+/um, true) diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 4475dca7..c573e711 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -17,7 +17,7 @@ def test_toplevel_unclosed_comment parse("' Line: 1 Position: 4 Last 80 unconsumed characters: @@ -48,6 +48,18 @@ def test_toplevel_malformed_comment_end DETAIL end + def test_doctype_unclosed_comment + exception = assert_raise(REXML::ParseException) do + parse("' + Line: 1 + Position: 19 + Last 80 unconsumed characters: + DETAIL + end + def test_doctype_malformed_comment_inner exception = assert_raise(REXML::ParseException) do parse("") @@ -72,16 +84,15 @@ def test_doctype_malformed_comment_end DETAIL end - def test_after_doctype_malformed_comment_short + def test_after_doctype_unclosed_comment exception = assert_raise(REXML::ParseException) do parse("") end - assert_equal(<<~DETAIL.chomp, exception.to_s) - Malformed comment + assert_equal(<<~DETAIL, exception.to_s) + Unclosed comment: Missing end '-->' Line: 1 Position: 8 Last 80 unconsumed characters: - --> DETAIL end From a5f31c49be106011c4d96cb0e308ebbba118d192 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 5 Mar 2025 06:20:42 +0900 
Subject: [PATCH 23/45] Improve CDATA and comment parse performance (#246) ## Why? Since `` are malformed node, they do not need to be checked before comments and CDATA. ## Benchmark : comment (after_doctype) ``` $ benchmark-driver benchmark/parse_comment.yaml Calculating ------------------------------------- before after before(YJIT) after(YJIT) after_doctype 1.306k 5.586k 1.152k 3.569k i/s - 100.000 times in 0.076563s 0.017903s 0.086822s 0.028020s Comparison: after_doctype after: 5585.7 i/s after(YJIT): 3568.9 i/s - 1.57x slower before: 1306.1 i/s - 4.28x slower before(YJIT): 1151.8 i/s - 4.85x slower ``` - YJIT=ON : 3.09x faster - YJIT=OFF : 4.28x faster ## Benchmark : CDATA ``` $ benchmark-driver benchmark/parse_cdata.yaml Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 1.269k 5.548k 1.053k 3.072k i/s - 100.000 times in 0.078808s 0.018026s 0.094976s 0.032553s sax 1.399k 8.244k 1.220k 4.460k i/s - 100.000 times in 0.071458s 0.012130s 0.081958s 0.022422s pull 1.411k 8.319k 1.260k 4.806k i/s - 100.000 times in 0.070883s 0.012021s 0.079335s 0.020809s stream 1.420k 8.320k 1.254k 4.728k i/s - 100.000 times in 0.070406s 0.012019s 0.079738s 0.021149s Comparison: dom after: 5547.5 i/s after(YJIT): 3071.9 i/s - 1.81x slower before: 1268.9 i/s - 4.37x slower before(YJIT): 1052.9 i/s - 5.27x slower sax after: 8244.0 i/s after(YJIT): 4459.9 i/s - 1.85x slower before: 1399.4 i/s - 5.89x slower before(YJIT): 1220.1 i/s - 6.76x slower pull after: 8318.8 i/s after(YJIT): 4805.6 i/s - 1.73x slower before: 1410.8 i/s - 5.90x slower before(YJIT): 1260.5 i/s - 6.60x slower stream after: 8320.2 i/s after(YJIT): 4728.4 i/s - 1.76x slower before: 1420.3 i/s - 5.86x slower before(YJIT): 1254.1 i/s - 6.63x slower ``` - YJIT=ON : 2.91x - 3.80x faster - YJIT=OFF : 4.37x - 5.90x faster Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 6 ++---- test/parse/test_comment.rb | 13 +++++++++++++ 2 files changed, 15 insertions(+), 4 
deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 61d38ae2..de85aebd 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -449,9 +449,7 @@ def pull_event end return [ :end_element, last_tag ] elsif @source.match?("!", true) - md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" - raise REXML::ParseException.new("Malformed node", @source) unless md if @source.match?("--", true) return [ :comment, process_comment ] elsif @source.match?("[CDATA[", true) @@ -461,9 +459,9 @@ def pull_event else raise REXML::ParseException.new("Malformed CDATA: Missing end ']]>'", @source) end + else + raise REXML::ParseException.new("Malformed node: Started with '") From a85203e88c8f50f64140fb50492cf9dbe3d79301 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 5 Mar 2025 09:45:19 +0900 Subject: [PATCH 24/45] Raise appropriate exception when failing to match start tag in DOCTYPE (#247) ## Why? Added exception to make the process easier to understand. 
--- lib/rexml/parsers/baseparser.rb | 5 +++-- test/parse/test_comment.rb | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index de85aebd..750b1697 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -412,14 +412,15 @@ def pull_event return [:notationdecl, name, *id] elsif @source.match?("--", true) return [ :comment, process_comment ] + else + raise REXML::ParseException.new("Malformed node: Started with '/um, true) @document_status = :after_doctype return [ :end_doctype ] - end - if @document_status == :in_doctype + else raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source) end end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 5349c18e..6339835d 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -48,6 +48,19 @@ def test_toplevel_malformed_comment_end DETAIL end + def test_doctype_malformed_node + exception = assert_raise(REXML::ParseException) do + parse(" Date: Thu, 3 Apr 2025 03:45:35 -0400 Subject: [PATCH 25/45] Fix docs typo in code example (#248) --- lib/rexml/document.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index d1747dd4..1960012c 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -309,8 +309,8 @@ def stand_alone? end # :call-seq: - # doc.write(output=$stdout, indent=-1, transtive=false, ie_hack=false, encoding=nil) - # doc.write(options={:output => $stdout, :indent => -1, :transtive => false, :ie_hack => false, :encoding => nil}) + # doc.write(output=$stdout, indent=-1, transitive=false, ie_hack=false, encoding=nil) + # doc.write(options={:output => $stdout, :indent => -1, :transitive => false, :ie_hack => false, :encoding => nil}) # # Write the XML tree out, optionally with indent. 
This writes out the # entire XML document, including XML declarations, doctype declarations, From d944fa478a972febe9c3ad2cf35232223d391597 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 3 May 2025 09:03:12 +0900 Subject: [PATCH 26/45] NEWS.md : Fix the mentioned of the PR in CVE-2024-35176. (#253) I think the mention of CVE-2024-35176 in NEWS.md is incorrect. ``` - Improved parse performance when an attribute has many ' characters. --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 51a45cab..7f95d829 100644 --- a/NEWS.md +++ b/NEWS.md @@ -386,7 +386,7 @@ * Patch by NAITOH Jun. - * Improved parse performance when an attribute has many `<`s. + * Improved parse performance when an attribute has many `>`s. * GH-126 From de6f40ed8749dd6ab4b7c4b80494a824f7f9027a Mon Sep 17 00:00:00 2001 From: tomoya ishida Date: Sat, 3 May 2025 09:21:27 +0900 Subject: [PATCH 27/45] Fix reverse sort in xpath_parser (#251) The code below was failing with `REXML::XPathParser#sort': undefined method '-@' for an instance of Array` ```ruby d = REXML::Document.new("") matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") # Before: error # After: [, , ] ``` This pull request will fix it.
--- lib/rexml/xpath_parser.rb | 2 +- test/xpath/test_base.rb | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 5eb1e5a9..f86a87e6 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -671,7 +671,7 @@ def sort(array_of_nodes, order) if order == :forward index else - -index + index.map(&:-@) end end ordered.collect do |_index, node| diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 1dacd69d..53264a9e 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -416,6 +416,12 @@ def test_preceding assert_equal( 4, cs.length ) end + def test_preceding_sibling + d = REXML::Document.new("") + matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") + assert_equal(["e", "d", "c"], matches.map(&:name)) + end + def test_following d = Document.new "" start = XPath.first( d, "/a/b[@id='0']" ) From 249d770b4ead129abf475708e84e3f1f7908962a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 6 May 2025 21:33:00 +0900 Subject: [PATCH 28/45] Fix duplicate responses in XPath following, following-sibling, preceding, preceding-sibling (#255) ## Why? 
See: https://github.com/ruby/rexml/pull/251#issuecomment-2845103143 ## Expected values - XPath : a/d/preceding::* => ["d", "c", "b"] ```xml ``` - XPath : a/d/following::* => ["d", "e", "f"] ```xml ``` - XPath : a/b/x/following-sibling::* => ["c", "d", "e"] ```xml ``` - XPath : a/b/x/following-sibling::* => ["c", "d", "x", "e"] ```xml ``` - XPath : a/b/x/preceding-sibling::* => ["e", "d", "c"] ```xml ``` - XPath : a/b/x/preceding-sibling::* => ["e", "x", "d", "c"] ```xml ``` - XPath : //a/following-sibling::*[1] => ["w", "x", "y", "z"] ```xml ``` --- lib/rexml/xpath_parser.rb | 2 +- test/xpath/test_base.rb | 97 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 95 insertions(+), 4 deletions(-) diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index f86a87e6..cde2e5d5 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -144,7 +144,7 @@ def match(path_stack, nodeset) result = expr(path_stack, nodeset) case result when Array # nodeset - unnode(result) + unnode(result).uniq else [result] end diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 53264a9e..b923eed2 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -416,12 +416,103 @@ def test_preceding assert_equal( 4, cs.length ) end - def test_preceding_sibling - d = REXML::Document.new("") - matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") + def test_preceding_multiple + source = <<-XML + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/d/preceding::*") + assert_equal(["d", "c", "b"], matches.map(&:name)) + end + + def test_following_multiple + source = <<-XML + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/d/following::*") + assert_equal(["d", "e", "f"], matches.map(&:name)) + end + + def test_following_sibling_across_multiple_nodes + source = <<-XML + + + + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc,
"a/b/x/following-sibling::*") + assert_equal(["c", "d", "e"], matches.map(&:name)) + end + + def test_following_sibling_within_single_node + source = <<-XML + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/following-sibling::*") + assert_equal(["c", "d", "x", "e"], matches.map(&:name)) + end + + def test_following_sibling_predicates + source = <<-XML + + XML + doc = REXML::Document.new(source) + # Finds a node flowing + matches = REXML::XPath.match(doc, "//a/following-sibling::*[1]") + assert_equal(["w", "x", "y", "z"], matches.map(&:name)) + end + + def test_preceding_sibling_across_multiple_nodes + source = <<-XML + + + + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/preceding-sibling::*") assert_equal(["e", "d", "c"], matches.map(&:name)) end + def test_preceding_sibling_within_single_node + source = <<-XML + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/preceding-sibling::*") + assert_equal(["e", "x", "d", "c"], matches.map(&:name)) + end + def test_following d = Document.new "" start = XPath.first( d, "/a/b[@id='0']" ) From cd575a10cac58eb47f235ed186060ac65ffb5284 Mon Sep 17 00:00:00 2001 From: tomoya ishida Date: Wed, 7 May 2025 21:02:31 +0900 Subject: [PATCH 29/45] Deprecate accepting array as an element in XPath.match, first and each (#252) `XPath.match`, `XPath.first`, `XPath.each`, `XPathParser#parse` and `XPathParser#match` accepted nodeset as element. This pull request changes the first parameter of these method to be an element instead of nodeset. Passing nodeset will be deprecated. ```ruby # Documented usage. OK REXML::XPath.match(element, xpath) # Undocumented usage. Deprecate in this pull request nodeset = [element] REXML::XPath.match(nodeset, xpath) ``` ### Background #249 will introduce a temporary cache. 
```ruby def parse path, nodeset path_stack = @parser.parse( path ) nodeset.first.document.send(:enable_cache) do match( path_stack, nodeset ) end end ``` But the signature `XPathParser#match(path, nodeset)` does not guarantee that all nodes in the nodeset have the same root document. So cache does not work in the code below. It's still slow. ```ruby REXML::XPath.match(2.times.map { REXML::Document.new(''*400+''*400) }, 'a//a') ``` The interface is holding us back, so I propose to drop accepting array as element. This change is a backward incompatibility, but it just drops an undocumented feature. I think only the test code was unintentionally using this feature. ### XPath.match with array XPath.match only traverses the first element of the array for some selectors. ```ruby nodeset = [REXML::Document.new(""), REXML::Document.new("")] REXML::XPath.match(nodeset, "a/*") #=> [, ] REXML::XPath.match(nodeset, "//a/*") #=> [] # I expect [, ] but the second document is ignored ``` It indicates that `XPath.match` is not designed to search inside multiple nodes/documents. --------- Co-authored-by: Sutou Kouhei --- lib/rexml/xpath.rb | 3 --- lib/rexml/xpath_parser.rb | 22 ++++++++++++---------- test/test_jaxen.rb | 16 ++++++++++------ test/xpath/test_base.rb | 17 ++++++++++++++--- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/lib/rexml/xpath.rb b/lib/rexml/xpath.rb index a0921bd8..666d764f 100644 --- a/lib/rexml/xpath.rb +++ b/lib/rexml/xpath.rb @@ -35,7 +35,6 @@ def XPath::first(element, path=nil, namespaces=nil, variables={}, options={}) parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of? Array parser.parse(path, element).flatten[0] end @@ -64,7 +63,6 @@ def XPath::each(element, path=nil, namespaces=nil, variables={}, options={}, &bl parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of?
Array parser.parse(path, element).each( &block ) end @@ -74,7 +72,6 @@ def XPath::match(element, path=nil, namespaces=nil, variables={}, options={}) parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of? Array parser.parse(path,element) end end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index cde2e5d5..8440015b 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -76,19 +76,19 @@ def variables=( vars={} ) @variables = vars end - def parse path, nodeset + def parse path, node path_stack = @parser.parse( path ) - match( path_stack, nodeset ) + match( path_stack, node ) end - def get_first path, nodeset + def get_first path, node path_stack = @parser.parse( path ) - first( path_stack, nodeset ) + first( path_stack, node ) end - def predicate path, nodeset + def predicate path, node path_stack = @parser.parse( path ) - match( path_stack, nodeset ) + match( path_stack, node ) end def []=( variable_name, value ) @@ -136,11 +136,13 @@ def first( path_stack, node ) end - def match(path_stack, nodeset) - nodeset = nodeset.collect.with_index do |node, i| - position = i + 1 - XPathNode.new(node, position: position) + def match(path_stack, node) + if node.is_a?(Array) + Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) + return [] if node.empty? 
+ node = node.first end + nodeset = [XPathNode.new(node, position: 1)] result = expr(path_stack, nodeset) case result when Array # nodeset diff --git a/test/test_jaxen.rb b/test/test_jaxen.rb index 6038e88e..548120d6 100644 --- a/test/test_jaxen.rb +++ b/test/test_jaxen.rb @@ -56,7 +56,9 @@ def process_test_case(name) # processes a tests/document/context node def process_context(doc, context) - test_context = XPath.match(doc, context.attributes["select"]) + matched = XPath.match(doc, context.attributes["select"]) + assert_equal(1, matched.size) + test_context = matched.first namespaces = context.namespaces namespaces.delete("var") namespaces = nil if namespaces.empty? @@ -101,10 +103,14 @@ def process_nominal_test(context, variables, namespaces, test) assert_equal(Integer(expected, 10), matched.size, user_message(context, xpath, matched)) + else + assert_operator(matched.size, :>, 0, user_message(context, xpath, matched)) end XPath.each(test, "valueOf") do |value_of| - process_value_of(matched, variables, namespaces, value_of) + matched.each do |subcontext| + process_value_of(subcontext, variables, namespaces, value_of) + end end end @@ -118,10 +124,8 @@ def process_exceptional_test(context, variables, namespaces, test) def user_message(context, xpath, matched) message = "" - context.each_with_index do |node, i| - message << "Node#{i}:\n" - message << "#{node}\n" - end + message << "Node:\n" + message << "#{context}\n" message << "XPath: <#{xpath}>\n" message << "Matched <#{matched}>" message diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index b923eed2..ab22f6f9 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -411,9 +411,10 @@ def test_preceding s = "" d = REXML::Document.new(s) - c = REXML::XPath.match( d, "//c[@id = '5']") - cs = REXML::XPath.match( c, "preceding::c" ) - assert_equal( 4, cs.length ) + c = REXML::XPath.match(d, "//c[@id = '5']") + assert_equal(1, c.length) + cs = REXML::XPath.match(c.first, "preceding::c") + 
assert_equal(4, cs.length) end def test_preceding_multiple @@ -1255,5 +1256,15 @@ def test_or_and end assert_equal(["/"], hrefs, "Bug #3842 [ruby-core:32447]") end + + def test_match_with_deprecated_usage + verbose, $VERBOSE = $VERBOSE, nil + doc = Document.new("") + assert_equal(['b'], XPath.match([doc, doc], '//b').map(&:name)) + assert_equal(['b'], XPath.match([doc], '//b').map(&:name)) + assert_equal([], XPath.match([], '//b').map(&:name)) + ensure + $VERBOSE = verbose + end end end From e80ffdd12713cd138dbe33f26968452dc33d20df Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 12 May 2025 10:22:11 +0900 Subject: [PATCH 30/45] Improve using `//` in XPath performance (#249) When using `//` in XPath, the deeper the tag hierarchy, the slower it becomes due to the namespace acquisition process. Caching namespace information improves performance when using `//` with XPath. ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/xpath.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) REXML::XPath.match(REXML::Document.new(xml), 'a//a') 29.215 234.909 108.945 492.410 i/s - 100.000 times in 3.422925s 0.425697s 0.917898s 0.203083s Comparison: REXML::XPath.match(REXML::Document.new(xml), 'a//a') master(YJIT): 492.4 i/s master: 234.9 i/s - 2.10x slower 3.4.1(YJIT): 108.9 i/s - 4.52x slower rexml 3.4.1: 29.2 i/s - 16.85x slower ``` - YJIT=ON : 4.52x faster - YJIT=OFF : 8.04x faster --------- Co-authored-by: tomoya ishida Co-authored-by: Sutou Kouhei --- benchmark/xpath.yaml | 32 ++++++++++++++++++++++++++++++++ lib/rexml/attribute.rb | 4 ++++ lib/rexml/document.rb | 14 ++++++++++++++ lib/rexml/element.rb | 33 +++++++++++++++++---------------- lib/rexml/xpath_parser.rb | 27 ++++++++++++--------------- test/test_core.rb | 23 +++++++++++++++++------ test/xpath/test_base.rb | 10 ++++++++++ 7 files changed, 106 insertions(+), 37 deletions(-) create mode 100644 benchmark/xpath.yaml diff --git 
a/benchmark/xpath.yaml b/benchmark/xpath.yaml new file mode 100644 index 00000000..d6e970eb --- /dev/null +++ b/benchmark/xpath.yaml @@ -0,0 +1,32 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + DEPTH = 100 + xml = '' * DEPTH + '' * DEPTH + doc = REXML::Document.new(xml) + +benchmark: + "REXML::XPath.match(REXML::Document.new(xml), 'a//a')" : REXML::XPath.match(doc, "a//a") diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index fe48745c..7a190225 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -206,6 +206,10 @@ def xpath path += "/@#{self.expanded_name}" return path end + + def document + @element&.document + end end end #vim:ts=2 sw=2 noexpandtab: diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 1960012c..1c678bef 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -448,6 +448,20 @@ def document end private + + attr_accessor :namespaces_cache + + # New document level cache is created and available in this block. + # This API is thread unsafe. Users can't change this document in this block. 
+ def enable_cache + @namespaces_cache = {} + begin + yield + ensure + @namespaces_cache = nil + end + end + def build( source ) Parsers::TreeParser.new( source, self ).parse end diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 4e3a60b9..b62b6cc2 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -589,10 +589,12 @@ def prefixes # d.elements['//c'].namespaces # => {"x"=>"1", "y"=>"2", "z"=>"3"} # def namespaces - namespaces = {} - namespaces = parent.namespaces if parent - namespaces = namespaces.merge( attributes.namespaces ) - return namespaces + namespaces_cache = document&.__send__(:namespaces_cache) + if namespaces_cache + namespaces_cache[self] ||= calculate_namespaces + else + calculate_namespaces + end end # :call-seq: @@ -619,17 +621,9 @@ def namespace(prefix=nil) if prefix.nil? prefix = prefix() end - if prefix == '' - prefix = "xmlns" - else - prefix = "xmlns:#{prefix}" unless prefix[0,5] == 'xmlns' - end - ns = nil - target = self - while ns.nil? and target - ns = target.attributes[prefix] - target = target.parent - end + prefix = (prefix == '') ? 'xmlns' : prefix.delete_prefix("xmlns:") + ns = namespaces[prefix] + ns = '' if ns.nil? 
and prefix == 'xmlns' return ns end @@ -1516,8 +1510,15 @@ def write(output=$stdout, indent=-1, transitive=false, ie_hack=false) formatter.write( self, output ) end - private + def calculate_namespaces + if parent + parent.namespaces.merge(attributes.namespaces) + else + attributes.namespaces + end + end + def __to_xpath_helper node rv = node.expanded_name.clone if node.parent diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 8440015b..70ae8919 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -78,7 +78,15 @@ def variables=( vars={} ) def parse path, node path_stack = @parser.parse( path ) - match( path_stack, node ) + if node.is_a?(Array) + Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) + return [] if node.empty? + node = node.first + end + + node.document.__send__(:enable_cache) do + match( path_stack, node ) + end end def get_first path, node @@ -137,11 +145,6 @@ def first( path_stack, node ) def match(path_stack, node) - if node.is_a?(Array) - Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) - return [] if node.empty? - node = node.first - end nodeset = [XPathNode.new(node, position: 1)] result = expr(path_stack, nodeset) case result @@ -494,14 +497,10 @@ def node_test(path_stack, nodesets, any_type: :element) if strict? raw_node.name == name and raw_node.namespace == "" else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node, prefix) end else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node, prefix) end when :attribute if prefix.nil? 
@@ -509,9 +508,7 @@ def node_test(path_stack, nodesets, any_type: :element) elsif prefix.empty? raw_node.name == name and raw_node.namespace == "" else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node.element, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node.element, prefix) end else false diff --git a/test/test_core.rb b/test/test_core.rb index 34fe9e07..651056f2 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -653,18 +653,23 @@ def test_namespace assert_equal "Some text", out end - def test_add_namespace e = Element.new 'a' + assert_equal("", e.namespace) + assert_nil(e.namespace('foo')) e.add_namespace 'someuri' e.add_namespace 'foo', 'otheruri' e.add_namespace 'xmlns:bar', 'thirduri' - assert_equal 'someuri', e.attributes['xmlns'] - assert_equal 'otheruri', e.attributes['xmlns:foo'] - assert_equal 'thirduri', e.attributes['xmlns:bar'] + assert_equal("someuri", e.namespace) + assert_equal("otheruri", e.namespace('foo')) + assert_equal("otheruri", e.namespace('xmlns:foo')) + assert_equal("thirduri", e.namespace('bar')) + assert_equal("thirduri", e.namespace('xmlns:bar')) + assert_equal('someuri', e.attributes['xmlns']) + assert_equal('otheruri', e.attributes['xmlns:foo']) + assert_equal('thirduri', e.attributes['xmlns:bar']) end - def test_big_documentation d = File.open(fixture_path("documentation.xml")) {|f| Document.new f } assert_equal "Sean Russell", d.elements["documentation/head/author"].text.tr("\n\t", " ").squeeze(" ") @@ -764,9 +769,15 @@ def test_attributes_each def test_delete_namespace doc = Document.new "" + assert_equal("1", doc.root.namespace) + assert_equal("2", doc.root.namespace('x')) + assert_equal("2", doc.root.namespace('xmlns:x')) doc.root.delete_namespace doc.root.delete_namespace 'x' - assert_equal "", doc.to_s + assert_equal("", doc.to_s) + assert_equal("", doc.root.namespace) + 
assert_nil(doc.root.namespace('x')) + assert_nil(doc.root.namespace('xmlns:x')) end def test_each_element_with_attribute diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index ab22f6f9..764171ab 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -1193,6 +1193,16 @@ def test_namespaces_0 assert_equal( 1, XPath.match( d, "//x:*" ).size ) end + def test_namespaces_cache + doc = Document.new("") + assert_equal("", XPath.first(doc, "//b[namespace-uri()='1']").to_s) + assert_nil(XPath.first(doc, "//b[namespace-uri()='']")) + + doc.root.delete_namespace + assert_nil(XPath.first(doc, "//b[namespace-uri()='1']")) + assert_equal("", XPath.first(doc, "//b[namespace-uri()='']").to_s) + end + def test_ticket_71 doc = Document.new(%Q{}) el = doc.root.elements[1] From 3dc9eca877f8444b7ac1d6008feb724cbfdc239a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 29 May 2025 10:14:32 +0900 Subject: [PATCH 31/45] Improve `Text.check` performance (#256) The doctype parameter of Text.check is not being used. Changing the doctype parameter to an optional parameter improves the parsing speed of the DOM. 
## Benchmark ``` before after before(YJIT) after(YJIT) dom 19.854 23.805 33.969 37.712 i/s - 100.000 times in 5.036779s 4.200839s 2.943877s 2.651709s sax 29.436 30.494 54.070 55.089 i/s - 100.000 times in 3.397155s 3.279348s 1.849463s 1.815255s pull 34.908 34.857 62.969 64.895 i/s - 100.000 times in 2.864651s 2.868842s 1.588082s 1.540939s stream 34.570 34.281 60.616 60.355 i/s - 100.000 times in 2.892656s 2.917080s 1.649737s 1.656866s Comparison: dom after(YJIT): 37.7 i/s before(YJIT): 34.0 i/s - 1.11x slower after: 23.8 i/s - 1.58x slower before: 19.9 i/s - 1.90x slower sax after(YJIT): 55.1 i/s before(YJIT): 54.1 i/s - 1.02x slower after: 30.5 i/s - 1.81x slower before: 29.4 i/s - 1.87x slower pull after(YJIT): 64.9 i/s before(YJIT): 63.0 i/s - 1.03x slower before: 34.9 i/s - 1.86x slower after: 34.9 i/s - 1.86x slower stream before(YJIT): 60.6 i/s after(YJIT): 60.4 i/s - 1.00x slower before: 34.6 i/s - 1.75x slower after: 34.3 i/s - 1.77x slower ``` - YJIT=ON : 1.00x - 1.11x faster (dom: 1.11x faster) - YJIT=OFF : 1.00x - 1.20x faster (dom: 1.20x faster) --- lib/rexml/attribute.rb | 2 +- lib/rexml/text.rb | 6 +++--- test/test_text_check.rb | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 7a190225..ba49207c 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -173,7 +173,7 @@ def element=( element ) @element = element if @normalized - Text.check( @normalized, NEEDS_A_SECOND_CHECK, doctype ) + Text.check( @normalized, NEEDS_A_SECOND_CHECK ) end self diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 2bf480fb..6f821472 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -104,16 +104,16 @@ def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, @entity_filter = entity_filter if entity_filter clear_cache - Text.check(@string, illegal, doctype) if @raw + Text.check(@string, illegal) if @raw end def parent= parent super(parent) - Text.check(@string, 
NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent + Text.check(@string, NEEDS_A_SECOND_CHECK) if @raw and @parent end # check for illegal characters - def Text.check string, pattern, doctype + def Text.check string, pattern, doctype = nil # illegal anywhere if !string.match?(VALID_XML_CHARS) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index 11cf65a3..3f2f7864 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -4,7 +4,7 @@ module REXMLTests class TextCheckTester < Test::Unit::TestCase def check(string) - REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil) + REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK) end def assert_check(string) From 95b8ef8d8549eb98763477e6e5307bf97c1dc4c5 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 7 Jul 2025 14:16:56 +0900 Subject: [PATCH 32/45] Fix wrong Encoding resolution (#258) In this context, `Encoding` means `REXML::Encoding` not `Encoding`. --- lib/rexml/encoding.rb | 2 +- test/test_source.rb | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 test/test_source.rb diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index da2d70d6..f8459316 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -5,7 +5,7 @@ module Encoding # ID ---> Encoding name attr_reader :encoding def encoding=(encoding) - encoding = encoding.name if encoding.is_a?(Encoding) + encoding = encoding.name if encoding.is_a?(::Encoding) if encoding.is_a?(String) original_encoding = encoding encoding = find_encoding(encoding) diff --git a/test/test_source.rb b/test/test_source.rb new file mode 100644 index 00000000..b309105a --- /dev/null +++ b/test/test_source.rb @@ -0,0 +1,21 @@ +require "rexml/source" + +module REXMLTests + class TestSource < Test::Unit::TestCase + def setup + @source = REXML::Source.new(+"") + end + + sub_test_case("#encoding=") do + test("String") do + @source.encoding = "UTF-8" + assert_equal("UTF-8", 
@source.encoding) + end + + test("Encoding") do + @source.encoding = Encoding::UTF_8 + assert_equal("UTF-8", @source.encoding) + end + end + end +end From 548172637b8eb106ea38f3b91f54d0fc2e6e8e08 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 9 Jul 2025 06:14:08 +0900 Subject: [PATCH 33/45] Don't call needless encoding_updated (#259) A needless encoding_updated call may incur a small performance penalty. --- lib/rexml/encoding.rb | 7 ++----- test/test_source.rb | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index f8459316..7eb05f4d 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -13,12 +13,9 @@ def encoding=(encoding) raise ArgumentError, "Bad encoding name #{original_encoding}" end end + encoding = encoding.upcase if encoding return false if defined?(@encoding) and encoding == @encoding - if encoding - @encoding = encoding.upcase - else - @encoding = 'UTF-8' - end + @encoding = encoding || "UTF-8" true end diff --git a/test/test_source.rb b/test/test_source.rb index b309105a..86755f37 100644 --- a/test/test_source.rb +++ b/test/test_source.rb @@ -12,6 +12,21 @@ def setup assert_equal("UTF-8", @source.encoding) end + test("encoding_updated") do + def @source.n_encoding_updated_called + @n_encoding_updated_called + end + def @source.encoding_updated + super + @n_encoding_updated_called ||= 0 + @n_encoding_updated_called += 1 + end + @source.encoding = "shift-jis" + assert_equal(1, @source.n_encoding_updated_called) + @source.encoding = "Shift-JIS" + assert_equal(1, @source.n_encoding_updated_called) + end + test("Encoding") do @source.encoding = Encoding::UTF_8 assert_equal("UTF-8", @source.encoding) From ec410a0d5e5e5daddca82fd1455824219403f676 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:06:45 +0700 Subject: [PATCH 34/45] Reuse XPath.match (#263) `XPath.each` and `XPath.first` can reuse `XPath.match`. 
--- lib/rexml/xpath.rb | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/lib/rexml/xpath.rb b/lib/rexml/xpath.rb index 666d764f..eed0300c 100644 --- a/lib/rexml/xpath.rb +++ b/lib/rexml/xpath.rb @@ -31,11 +31,7 @@ class XPath def XPath::first(element, path=nil, namespaces=nil, variables={}, options={}) raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.nil? or namespaces.kind_of?(Hash) raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of?(Hash) - parser = XPathParser.new(**options) - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - parser.parse(path, element).flatten[0] + match(element, path, namespaces, variables, options).flatten[0] end # Iterates over nodes that match the given path, calling the supplied @@ -59,11 +55,7 @@ def XPath::first(element, path=nil, namespaces=nil, variables={}, options={}) def XPath::each(element, path=nil, namespaces=nil, variables={}, options={}, &block) raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.nil? or namespaces.kind_of?(Hash) raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of?(Hash) - parser = XPathParser.new(**options) - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - parser.parse(path, element).each( &block ) + match(element, path, namespaces, variables, options).each( &block ) end # Returns an array of nodes matching a given XPath. From 2271fd374403bcdfb0b9f288cc0d97c92af9d886 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:18:20 +0700 Subject: [PATCH 35/45] docs: Use # to reference instance methods (#270) Fixes #269 We should use `XXX#method` not `XXX.method` to reference instance methods. 
--- lib/rexml/cdata.rb | 2 +- lib/rexml/comment.rb | 2 +- lib/rexml/element.rb | 2 +- lib/rexml/instruction.rb | 2 +- lib/rexml/node.rb | 2 +- lib/rexml/text.rb | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/rexml/cdata.rb b/lib/rexml/cdata.rb index 997f5a08..264ad642 100644 --- a/lib/rexml/cdata.rb +++ b/lib/rexml/cdata.rb @@ -58,7 +58,7 @@ def value # c = CData.new( " Some text " ) # c.write( $stdout ) #-> def write( output=$stdout, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn( "#{self.class.name}.write is deprecated", uplevel: 1) + Kernel.warn( "#{self.class.name}#write is deprecated", uplevel: 1) indent( output, indent ) output << START output << @string diff --git a/lib/rexml/comment.rb b/lib/rexml/comment.rb index 52c58b46..e7e104d4 100644 --- a/lib/rexml/comment.rb +++ b/lib/rexml/comment.rb @@ -48,7 +48,7 @@ def clone # ie_hack:: # Needed for conformity to the child API, but not used by this class. def write( output, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn("Comment.write is deprecated. See REXML::Formatters", uplevel: 1) + Kernel.warn("#{self.class.name}#write is deprecated. See REXML::Formatters", uplevel: 1) indent( output, indent ) output << START output << @string diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index b62b6cc2..4311d58f 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -1496,7 +1496,7 @@ def texts # doc.write( out ) #-> doc is written to the string 'out' # doc.write( $stdout ) #-> doc written to the console def write(output=$stdout, indent=-1, transitive=false, ie_hack=false) - Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters", uplevel: 1) + Kernel.warn("#{self.class.name}#write is deprecated. 
See REXML::Formatters", uplevel: 1) formatter = if indent > -1 if transitive require_relative "formatters/transitive" diff --git a/lib/rexml/instruction.rb b/lib/rexml/instruction.rb index 318741f0..a3dfbbec 100644 --- a/lib/rexml/instruction.rb +++ b/lib/rexml/instruction.rb @@ -49,7 +49,7 @@ def clone # See the rexml/formatters package # def write writer, indent=-1, transitive=false, ie_hack=false - Kernel.warn( "#{self.class.name}.write is deprecated", uplevel: 1) + Kernel.warn( "#{self.class.name}#write is deprecated", uplevel: 1) indent(writer, indent) writer << START writer << @target diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index c771db70..033b740d 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -26,7 +26,7 @@ def previous_sibling_node # REXML::Formatters package for changing the output style. def to_s indent=nil unless indent.nil? - Kernel.warn( "#{self.class.name}.to_s(indent) parameter is deprecated", uplevel: 1) + Kernel.warn( "#{self.class.name}#to_s(indent) parameter is deprecated", uplevel: 1) f = REXML::Formatters::Pretty.new( indent ) f.write( self, rv = "" ) else diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 6f821472..e03ce9d1 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -287,7 +287,7 @@ def indent_text(string, level=1, style="\t", indentfirstline=true) # See REXML::Formatters # def write( writer, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters", uplevel: 1) + Kernel.warn("#{self.class.name}#write is deprecated. See REXML::Formatters", uplevel: 1) formatter = if indent > -1 REXML::Formatters::Pretty.new( indent ) else From d427fc5914fcc17d7247c5ff9099ee38639d6702 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:20:21 +0700 Subject: [PATCH 36/45] Avoid redundant calls for doctype (#264) We can avoid calling `Document#doctype` by keeping `Document#doctype` result in a local variable. 
--- lib/rexml/element.rb | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 4311d58f..1c580577 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -2325,11 +2325,11 @@ def get_attribute( name ) return attr end end - element_document = @element.document - if element_document and element_document.doctype + doctype = @element.document&.doctype + if doctype expn = @element.expanded_name - expn = element_document.doctype.name if expn.size == 0 - attr_val = element_document.doctype.attribute_of(expn, name) + expn = doctype.name if expn.size == 0 + attr_val = doctype.attribute_of(expn, name) return Attribute.new( name, attr_val ) if attr_val end return nil @@ -2371,8 +2371,9 @@ def []=( name, value ) end unless value.kind_of? Attribute - if @element.document and @element.document.doctype - value = Text::normalize( value, @element.document.doctype ) + doctype = @element.document&.doctype + if doctype + value = Text::normalize( value, doctype ) else value = Text::normalize( value, nil ) end @@ -2409,10 +2410,11 @@ def prefixes each_attribute do |attribute| ns << attribute.name if attribute.prefix == 'xmlns' end - if @element.document and @element.document.doctype + doctype = @element.document&.doctype + if doctype expn = @element.expanded_name - expn = @element.document.doctype.name if expn.size == 0 - @element.document.doctype.attributes_of(expn).each { + expn = doctype.name if expn.size == 0 + doctype.attributes_of(expn).each { |attribute| ns << attribute.name if attribute.prefix == 'xmlns' } @@ -2434,10 +2436,11 @@ def namespaces each_attribute do |attribute| namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' end - if @element.document and @element.document.doctype + doctype = @element.document&.doctype + if doctype expn = @element.expanded_name - expn = @element.document.doctype.name if expn.size == 0 - 
@element.document.doctype.attributes_of(expn).each { + expn = doctype.name if expn.size == 0 + doctype.attributes_of(expn).each { |attribute| namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' } From 63f3e9772595a64b036953f0ab026d2ea5560a3b Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:21:10 +0700 Subject: [PATCH 37/45] Use Safe Navigation (&.) from Ruby 2.3 (#265) We can simplify our code by using `&.`. --- lib/rexml/attribute.rb | 5 +---- lib/rexml/child.rb | 3 +-- lib/rexml/doctype.rb | 11 +++-------- lib/rexml/element.rb | 3 +-- lib/rexml/text.rb | 5 +---- 5 files changed, 7 insertions(+), 20 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index ba49207c..1326563a 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -130,10 +130,7 @@ def to_string end def doctype - if @element - doc = @element.document - doc.doctype if doc - end + @element&.document&.doctype end # Returns the attribute value, with entities replaced diff --git a/lib/rexml/child.rb b/lib/rexml/child.rb index cc6e9a47..40abde87 100644 --- a/lib/rexml/child.rb +++ b/lib/rexml/child.rb @@ -83,8 +83,7 @@ def previous_sibling=(other) # Returns:: the document this child belongs to, or nil if this child # belongs to no document def document - return parent.document unless parent.nil? 
- nil + parent&.document end # This doesn't yet handle encodings diff --git a/lib/rexml/doctype.rb b/lib/rexml/doctype.rb index f3590484..a9cf9f7e 100644 --- a/lib/rexml/doctype.rb +++ b/lib/rexml/doctype.rb @@ -171,15 +171,11 @@ def write( output, indent=0, transitive=false, ie_hack=false ) end def context - if @parent - @parent.context - else - nil - end + @parent&.context end def entity( name ) - @entities[name].unnormalized if @entities[name] + @entities[name]&.unnormalized end def add child @@ -288,8 +284,7 @@ def initialize name, middle, pub, sys end def to_s - context = nil - context = parent.context if parent + context = parent&.context notation = "( other ) end def doctype - if @parent - doc = @parent.document - doc.doctype if doc - end + @parent&.document&.doctype end REFERENCE = /#{Entity::REFERENCE}/ From 66232eaf680d0937ae59bea285cdb8e4d3d88a93 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 08:12:02 +0700 Subject: [PATCH 38/45] Remove redundant return statements (#266) Very slight behavior change here in `REXML::Validation::Event#matches?`, which is to align the predicate method's return value with the expected behavior of a predicate method (which is to return one of true or false). 
--- lib/rexml/attribute.rb | 4 +-- lib/rexml/document.rb | 4 +-- lib/rexml/element.rb | 47 +++++++++++--------------- lib/rexml/functions.rb | 6 ++-- lib/rexml/namespace.rb | 8 ++--- lib/rexml/node.rb | 2 +- lib/rexml/parsers/baseparser.rb | 7 ++-- lib/rexml/parsers/xpathparser.rb | 8 ++--- lib/rexml/quickpath.rb | 37 +++++++++++---------- lib/rexml/security.rb | 4 +-- lib/rexml/text.rb | 14 ++++---- lib/rexml/validation/relaxng.rb | 53 +++++++++++++++--------------- lib/rexml/validation/validation.rb | 16 ++++----- lib/rexml/xpath_parser.rb | 38 ++++++++++----------- 14 files changed, 118 insertions(+), 130 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 1326563a..c5673249 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -199,9 +199,7 @@ def inspect end def xpath - path = @element.xpath - path += "/@#{self.expanded_name}" - return path + @element.xpath + "/@#{self.expanded_name}" end def document diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 1c678bef..96ae5b75 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -415,7 +415,7 @@ def Document::entity_expansion_limit=( val ) # # Deprecated. Use REXML::Security.entity_expansion_limit= instead. def Document::entity_expansion_limit - return Security.entity_expansion_limit + Security.entity_expansion_limit end # Set the entity expansion limit. By default the limit is set to 10240. @@ -429,7 +429,7 @@ def Document::entity_expansion_text_limit=( val ) # # Deprecated. Use REXML::Security.entity_expansion_text_limit instead. 
def Document::entity_expansion_text_limit - return Security.entity_expansion_text_limit + Security.entity_expansion_text_limit end attr_reader :entity_expansion_count diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index e9ca684e..0d74811e 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -565,7 +565,7 @@ def prefixes prefixes = [] prefixes = parent.prefixes if parent prefixes |= attributes.prefixes - return prefixes + prefixes end # :call-seq: @@ -624,7 +624,7 @@ def namespace(prefix=nil) ns = namespaces[prefix] ns = '' if ns.nil? and prefix == 'xmlns' - return ns + ns end # :call-seq: @@ -956,7 +956,7 @@ def get_elements( xpath ) def next_element element = next_sibling element = element.next_sibling until element.nil? or element.kind_of? Element - return element + element end # :call-seq: @@ -972,7 +972,7 @@ def next_element def previous_element element = previous_sibling element = element.previous_sibling until element.nil? or element.kind_of? Element - return element + element end @@ -1022,8 +1022,7 @@ def has_text? # def text( path = nil ) rv = get_text(path) - return rv.value unless rv.nil? - nil + rv&.value end # :call-seq: @@ -1051,7 +1050,7 @@ def get_text path = nil else rv = @children.find { |node| node.kind_of? Text } end - return rv + rv end # :call-seq: @@ -1095,7 +1094,7 @@ def text=( text ) old_text.replace_with( text ) end end - return self + self end # :call-seq: @@ -1146,7 +1145,7 @@ def add_text( text ) text = Text.new( text, whitespace(), nil, raw() ) end self << text unless text.nil? 
- return self + self end # :call-seq: @@ -1190,7 +1189,7 @@ def xpath cur = cur.parent path_elements << __to_xpath_helper( cur ) end - return path_elements.reverse.join( "/" ) + path_elements.reverse.join( "/" ) end ################################################# @@ -1292,7 +1291,6 @@ def attribute( name, namespace=nil ) return nil unless ( namespaces[ prefix ] == namespaces[ 'xmlns' ] ) attributes.get_attribute( name ) - end # :call-seq: @@ -1306,7 +1304,7 @@ def attribute( name, namespace=nil ) # b.has_attributes? # => false # def has_attributes? - return !@attributes.empty? + !@attributes.empty? end # :call-seq: @@ -1684,11 +1682,7 @@ def []( index, name=nil) (num += 1) == index } else - return XPath::first( @element, index ) - #{ |element| - # return element if element.kind_of? Element - #} - #return nil + XPath::first( @element, index ) end end @@ -1735,7 +1729,7 @@ def []=( index, element ) else previous.replace_with element end - return previous + previous end # :call-seq: @@ -1774,7 +1768,7 @@ def index element child == element end return rv if found == element - return -1 + -1 end # :call-seq: @@ -1853,7 +1847,7 @@ def delete_all( xpath ) @element.delete element element.remove end - return rv + rv end # :call-seq: @@ -2180,8 +2174,7 @@ def initialize element # def [](name) attr = get_attribute(name) - return attr.value unless attr.nil? - return nil + attr&.value end # :call-seq: @@ -2336,7 +2329,7 @@ def get_attribute( name ) if attr.kind_of? Hash attr = attr[ @element.prefix ] end - return attr + attr end # :call-seq: @@ -2390,7 +2383,7 @@ def []=( name, value ) else store value.name, value end - return @element + @element end # :call-seq: @@ -2494,9 +2487,7 @@ def delete( attribute ) old.each_value{|v| repl = v} store name, repl end - elsif old.nil? 
- return @element - else # the supplied attribute is a top-level one + elsif old # the supplied attribute is a top-level one super(name) end @element @@ -2550,7 +2541,7 @@ def delete_all( name ) rv << attribute if attribute.expanded_name == name } rv.each{ |attr| attr.remove } - return rv + rv end # :call-seq: diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb index 4c114616..60ae34e7 100644 --- a/lib/rexml/functions.rb +++ b/lib/rexml/functions.rb @@ -39,11 +39,11 @@ def Functions::context=(value); @@context = value; end def Functions::text( ) if @@context[:node].node_type == :element - return @@context[:node].find_all{|n| n.node_type == :text}.collect{|n| n.value} + @@context[:node].find_all{|n| n.node_type == :text}.collect{|n| n.value} elsif @@context[:node].node_type == :text - return @@context[:node].value + @@context[:node].value else - return false + false end end diff --git a/lib/rexml/namespace.rb b/lib/rexml/namespace.rb index 2e67252a..232b7ca4 100644 --- a/lib/rexml/namespace.rb +++ b/lib/rexml/namespace.rb @@ -42,11 +42,11 @@ def name=( name ) # Compares names optionally WITH namespaces def has_name?( other, ns=nil ) if ns - return (namespace() == ns and name() == other) + namespace() == ns and name() == other elsif other.include? 
":" - return fully_expanded_name == other + fully_expanded_name == other else - return name == other + name == other end end @@ -57,7 +57,7 @@ def has_name?( other, ns=nil ) def fully_expanded_name ns = prefix return "#{ns}:#@name" if ns.size > 0 - return @name + @name end end end diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index 033b740d..bccacc51 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -68,7 +68,7 @@ def find_first_recursive(&block) # :yields: node each_recursive {|node| return node if block.call(node) } - return nil + nil end # Returns the position that +self+ holds in its parent's array, indexed diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 750b1697..a87657b5 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -206,12 +206,12 @@ def position # Returns true if there are no more events def empty? - return (@source.empty? and @stack.empty?) + (@source.empty? and @stack.empty?) end # Returns true if there are more events. Synonymous with !empty? def has_next? - return !(@source.empty? and @stack.empty?) + !(@source.empty? and @stack.empty?) end # Push an event back on the head of the stream. This method @@ -522,7 +522,8 @@ def pull_event raise REXML::ParseException.new( "Exception parsing", @source, self, (error ? error : $!) ) end - return [ :dummy ] + # NOTE: The end of the method never runs, because it is unreachable. + # All branches of code above have explicit unconditional return or raise statements. 
end private :pull_event diff --git a/lib/rexml/parsers/xpathparser.rb b/lib/rexml/parsers/xpathparser.rb index bd3b6856..a6d76fdc 100644 --- a/lib/rexml/parsers/xpathparser.rb +++ b/lib/rexml/parsers/xpathparser.rb @@ -215,7 +215,7 @@ def predicate_to_path(parsed, &block) else path << yield( parsed ) end - return path.squeeze(" ") + path.squeeze(" ") end # For backward compatibility alias_method :preciate_to_string, :predicate_to_path @@ -252,7 +252,7 @@ def LocationPath path, parsed path = path[1..-1] end end - return RelativeLocationPath( path, parsed ) if path.size > 0 + RelativeLocationPath( path, parsed ) if path.size > 0 end #RelativeLocationPath @@ -388,7 +388,7 @@ def NodeTest path, parsed else path = original_path end - return path + path end # Filters the supplied nodeset on the predicate(s) @@ -600,7 +600,7 @@ def PathExpr path, parsed end rest = LocationPath(rest, n) if rest =~ /\A[\/\.\@\[\w*]/ parsed.concat(n) - return rest + rest end #| FilterExpr Predicate diff --git a/lib/rexml/quickpath.rb b/lib/rexml/quickpath.rb index a0466b25..cded06f5 100644 --- a/lib/rexml/quickpath.rb +++ b/lib/rexml/quickpath.rb @@ -41,7 +41,7 @@ def QuickPath::match element, path, namespaces=EMPTY_HASH else results = filter([element], path) end - return results + results end # Given an array of nodes it filters the array based on the path. The @@ -51,18 +51,18 @@ def QuickPath::filter elements, path return elements if path.nil? 
or path == '' or elements.size == 0 case path when /^\/\//u # Descendant - return axe( elements, "descendant-or-self", $' ) + axe( elements, "descendant-or-self", $' ) when /^\/?\b(\w[-\w]*)\b::/u # Axe - return axe( elements, $1, $' ) + axe( elements, $1, $' ) when /^\/(?=\b([:!\w][-\.\w]*:)?[-!\*\.\w]*\b([^:(]|$)|\*)/u # Child rest = $' results = [] elements.each do |element| results |= filter( element.to_a, rest ) end - return results + results when /^\/?(\w[-\w]*)\(/u # / Function - return function( elements, $1, $' ) + function( elements, $1, $' ) when Namespace::NAMESPLIT # Element name name = $2 ns = $1 @@ -73,21 +73,21 @@ def QuickPath::filter elements, path (element.name == name and element.namespace == Functions.namespace_context[ns]))) end - return filter( elements, rest ) + filter( elements, rest ) when /^\/\[/u matches = [] elements.each do |element| matches |= predicate( element.to_a, path[1..-1] ) if element.kind_of? Element end - return matches + matches when /^\[/u # Predicate - return predicate( elements, path ) + predicate( elements, path ) when /^\/?\.\.\./u # Ancestor - return axe( elements, "ancestor", $' ) + axe( elements, "ancestor", $' ) when /^\/?\.\./u # Parent - return filter( elements.collect{|e|e.parent}, $' ) + filter( elements.collect{|e|e.parent}, $' ) when /^\/?\./u # Self - return filter( elements, $' ) + filter( elements, $' ) when /^\*/u # Any results = [] elements.each do |element| @@ -98,9 +98,10 @@ def QuickPath::filter elements, path # results |= filter( children, $' ) #end end - return results + results + else + [] end - return [] end def QuickPath::axe( elements, axe_name, rest ) @@ -138,7 +139,7 @@ def QuickPath::axe( elements, axe_name, rest ) matches = filter(elements.collect{|element| element.previous_sibling}.uniq, rest ) end - return matches.uniq + matches.uniq end OPERAND_ = '((?=(?:(?!and|or).)*[^\s<>=])[^\s<>=]+)' @@ -200,15 +201,15 @@ def QuickPath::predicate( elements, path ) results << element end end - return 
filter( results, rest ) + filter( results, rest ) end def QuickPath::attribute( name ) - return Functions.node.attributes[name] if Functions.node.kind_of? Element + Functions.node.attributes[name] if Functions.node.kind_of? Element end def QuickPath::name() - return Functions.node.name if Functions.node.kind_of? Element + Functions.node.name if Functions.node.kind_of? Element end def QuickPath::method_missing( id, *args ) @@ -234,7 +235,7 @@ def QuickPath::function( elements, fname, rest ) results << element if Functions.pair[0] == res end end - return results + results end def QuickPath::parse_args( element, string ) diff --git a/lib/rexml/security.rb b/lib/rexml/security.rb index 99b74607..e8e8c6b4 100644 --- a/lib/rexml/security.rb +++ b/lib/rexml/security.rb @@ -10,7 +10,7 @@ def self.entity_expansion_limit=( val ) # Get the entity expansion limit. By default the limit is set to 10000. def self.entity_expansion_limit - return @@entity_expansion_limit + @@entity_expansion_limit end @@entity_expansion_text_limit = 10_240 @@ -22,7 +22,7 @@ def self.entity_expansion_text_limit=( val ) # Get the entity expansion limit. By default the limit is set to 10240. def self.entity_expansion_text_limit - return @@entity_expansion_text_limit + @@entity_expansion_text_limit end end end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index c70f73f2..8799d89d 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -177,7 +177,7 @@ def empty? def clone - return Text.new(self, true) + Text.new(self, true) end @@ -261,10 +261,10 @@ def wrap(string, width, addnewline=false) # Recursively wrap string at width. 
return string if string.length <= width place = string.rindex(' ', width) # Position in string with last ' ' before cutoff - if addnewline then - return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) + if addnewline + "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) else - return string[0,place] + "\n" + wrap(string[place+1..-1], width) + string[0,place] + "\n" + wrap(string[place+1..-1], width) end end @@ -277,7 +277,7 @@ def indent_text(string, level=1, style="\t", indentfirstline=true) new_string << new_line } new_string.strip! unless indentfirstline - return new_string + new_string end # == DEPRECATED @@ -296,9 +296,7 @@ def write( writer, indent=-1, transitive=false, ie_hack=false ) # FIXME # This probably won't work properly def xpath - path = @parent.xpath - path += "/text()" - return path + @parent.xpath + "/text()" end # Writes out text, substituting special characters beforehand. diff --git a/lib/rexml/validation/relaxng.rb b/lib/rexml/validation/relaxng.rb index f29a2c05..c6894dcb 100644 --- a/lib/rexml/validation/relaxng.rb +++ b/lib/rexml/validation/relaxng.rb @@ -157,16 +157,16 @@ def next( event ) if ( @events[@current].matches?(event) ) @current += 1 if @events[@current].nil? - return @previous.pop + @previous.pop elsif @events[@current].kind_of? 
State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else - return nil + nil end end @@ -186,7 +186,7 @@ def inspect end def expected - return [@events[@current]] + [@events[@current]] end def <<( event ) @@ -244,7 +244,7 @@ def generate_event( event ) evt = :end_attribute end end - return Event.new( evt, arg ) + Event.new( evt, arg ) end end @@ -262,9 +262,10 @@ def next( event ) rv = super return rv if rv @prior = @previous.pop - return @prior.next( event ) + @prior.next( event ) + else + super end - super end def matches?(event) @@ -274,7 +275,7 @@ def matches?(event) def expected return [ @prior.expected, @events[0] ].flatten if @current == 0 - return [@events[@current]] + [@events[@current]] end end @@ -286,24 +287,24 @@ def next( event ) @current += 1 if @events[@current].nil? @current = 0 - return self + self elsif @events[@current].kind_of? State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else @prior = @previous.pop return @prior.next( event ) if @current == 0 - return nil + nil end end def expected return [ @prior.expected, @events[0] ].flatten if @current == 0 - return [@events[@current]] + [@events[@current]] end end @@ -326,17 +327,17 @@ def next( event ) @ord += 1 if @events[@current].nil? @current = 0 - return self + self elsif @events[@current].kind_of? 
State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else return @previous.pop.next( event ) if @current == 0 and @ord > 0 - return nil + nil end end @@ -347,9 +348,9 @@ def matches?( event ) def expected if @current == 0 and @ord > 0 - return [@previous[-1].expected, @events[0]].flatten + [@previous[-1].expected, @events[0]].flatten else - return [@events[@current]] + [@events[@current]] end end end @@ -403,7 +404,7 @@ def matches?( event ) def expected return [@events[@current]] if @events.size > 0 - return @choices.collect do |x| + @choices.collect do |x| if x[0].kind_of? State x[0].expected else @@ -490,16 +491,16 @@ def next( event ) @current += 1 if @events[@current].nil? return self unless @choices[@choice].nil? - return @previous.pop + @previous.pop elsif @events[@current].kind_of? State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else - return nil + nil end end @@ -510,7 +511,7 @@ def matches?( event ) def expected return [@events[@current]] if @events[@current] - return @choices[@choice..-1].collect do |x| + @choices[@choice..-1].collect do |x| if x[0].kind_of? State x[0].expected else diff --git a/lib/rexml/validation/validation.rb b/lib/rexml/validation/validation.rb index 0ad6ada4..6475c628 100644 --- a/lib/rexml/validation/validation.rb +++ b/lib/rexml/validation/validation.rb @@ -80,26 +80,26 @@ def done? end def single? 
- return (@event_type != :start_element and @event_type != :start_attribute) + (@event_type != :start_element and @event_type != :start_attribute) end def matches?( event ) return false unless event[0] == @event_type case event[0] when nil - return true + true when :start_element - return true if event[1] == @event_arg + event[1] == @event_arg when :end_element - return true + true when :start_attribute - return true if event[1] == @event_arg + event[1] == @event_arg when :end_attribute - return true + true when :end_document - return true + true when :text - return (@event_arg.nil? or @event_arg == event[1]) + @event_arg.nil? || @event_arg == event[1] =begin when :processing_instruction false diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 70ae8919..5cf3f28c 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -114,7 +114,7 @@ def first( path_stack, node ) case path[0] when :document # do nothing - return first( path[1..-1], node ) + first( path[1..-1], node ) when :child for c in node.children r = first( path[1..-1], c ) @@ -124,9 +124,9 @@ def first( path_stack, node ) name = path[2] if node.name == name return node if path.size == 3 - return first( path[3..-1], node ) + first( path[3..-1], node ) else - return nil + nil end when :descendant_or_self r = first( path[1..-1], node ) @@ -136,11 +136,12 @@ def first( path_stack, node ) return r if r end when :node - return first( path[1..-1], node ) + first( path[1..-1], node ) when :any - return first( path[1..-1], node ) + first( path[1..-1], node ) + else + nil end - return nil end @@ -167,10 +168,10 @@ def strict? # 2. 
If no mapping was supplied, use the context node to look up the namespace def get_namespace( node, prefix ) if @namespaces - return @namespaces[prefix] || '' + @namespaces[prefix] || '' else return node.namespace( prefix ) if node.node_type == :element - return '' + '' end end @@ -757,22 +758,19 @@ def following(node) end def following_node_of( node ) - if node.kind_of? Element and node.children.size > 0 - return node.children[0] - end - return next_sibling_node(node) + return node.children[0] if node.kind_of?(Element) and node.children.size > 0 + + next_sibling_node(node) end def next_sibling_node(node) psn = node.next_sibling_node while psn.nil? - if node.parent.nil? or node.parent.class == Document - return nil - end + return nil if node.parent.nil? or node.parent.class == Document node = node.parent psn = node.next_sibling_node end - return psn + psn end def child(nodeset) @@ -805,13 +803,13 @@ def child(nodeset) def norm b case b when true, false - return b + b when 'true', 'false' - return Functions::boolean( b ) + Functions::boolean( b ) when /^\d+(\.\d+)?$/, Numeric - return Functions::number( b ) + Functions::number( b ) else - return Functions::string( b ) + Functions::string( b ) end end From 04a589a61bf4e366abee8764ee74b03f4aecc4aa Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 08:17:16 +0700 Subject: [PATCH 39/45] Fix a bug that XPath can't be used for no document element (#268) Fixes #267 #249 improved performance by introducing cache. It requires document but we should not break backward compatibility for performance improvement. This restores the previous behavior but no document case doesn't have performance improvement introduced by #249. 
--- lib/rexml/xpath_parser.rb | 7 ++++++- test/parser/test_xpath.rb | 2 +- test/test_xpath_parser.rb | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 test/test_xpath_parser.rb diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 5cf3f28c..64c8846a 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -84,7 +84,12 @@ def parse path, node node = node.first end - node.document.__send__(:enable_cache) do + document = node.document + if document + document.__send__(:enable_cache) do + match( path_stack, node ) + end + else match( path_stack, node ) end end diff --git a/test/parser/test_xpath.rb b/test/parser/test_xpath.rb index 9143d25c..5d62afee 100644 --- a/test/parser/test_xpath.rb +++ b/test/parser/test_xpath.rb @@ -4,7 +4,7 @@ require "rexml/parsers/xpathparser" module REXMLTests - class TestXPathParser < Test::Unit::TestCase + class TestParserXPathParser < Test::Unit::TestCase sub_test_case("#abbreviate") do def abbreviate(xpath) parser = REXML::Parsers::XPathParser.new diff --git a/test/test_xpath_parser.rb b/test/test_xpath_parser.rb new file mode 100644 index 00000000..bcb14c34 --- /dev/null +++ b/test/test_xpath_parser.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +module REXMLTests + class TestXPathParser < Test::Unit::TestCase + def setup + @root_element = make_service_element(["urn:type1", "urn:type2"], ["http://uri"]) + @element = @root_element.children[0] + @parser = REXML::XPathParser.new + end + + def make_service_element(types, uris) + root_element = REXML::Element.new + element = root_element.add_element("Service") + types.each do |type_text| + element.add_element("Type").text = type_text + end + uris.each do |uri_text| + element.add_element("URI").text = uri_text + end + root_element + end + + def test_found + res = @parser.parse("/Service", @root_element) + assert_equal([@element], + res) + end + + def test_not_found + res = 
@parser.parse("/nonexistent", @root_element) + assert_equal([], + res) + end + end +end From 9b084d78708638cedff54743edc0907c4bd6574a Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Tue, 15 Jul 2025 08:23:42 +0700 Subject: [PATCH 40/45] Fix & Deprecate REXML::Text#indent_text (#275) - Fixes #273 - "Fix" in the sense that it restores the original behavior pre-v3.2.6, regardless of its fitness for purpose. - Regression Test Added --------- Co-authored-by: Sutou Kouhei --- lib/rexml/child.rb | 2 +- lib/rexml/text.rb | 4 +++- test/test_text.rb | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/rexml/child.rb b/lib/rexml/child.rb index 40abde87..2718040f 100644 --- a/lib/rexml/child.rb +++ b/lib/rexml/child.rb @@ -88,7 +88,7 @@ def document # This doesn't yet handle encodings def bytes - document.encoding + document&.encoding to_s end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 8799d89d..8d5281cd 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -269,8 +269,10 @@ def wrap(string, width, addnewline=false) end def indent_text(string, level=1, style="\t", indentfirstline=true) + Kernel.warn("#{self.class.name}#indent_text is deprecated. 
See REXML::Formatters", uplevel: 1) return string if level < 0 - new_string = '' + + new_string = +'' string.each_line { |line| indent_string = style * level new_line = (indent_string + line).sub(/[\s]+$/,'') diff --git a/test/test_text.rb b/test/test_text.rb index bae21656..c1f5765e 100644 --- a/test/test_text.rb +++ b/test/test_text.rb @@ -2,6 +2,7 @@ module REXMLTests class TextTester < Test::Unit::TestCase + include Helper::Global include REXML def test_new_text_response_whitespace_default @@ -69,5 +70,12 @@ def test_clone assert_equal(text.to_s, text.clone.to_s) end + + def test_indent_text + text = Text.new("") + suppress_warning do + assert_equal("\tline1\tline2\tline3", text.indent_text("line1\r\nline2\r\nline3\r\n")) + end + end end end From c60ae027a3c20f359fdf76fa41ae64d22313f482 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 16 Jul 2025 08:22:47 +0700 Subject: [PATCH 41/45] Remove bundler from dev deps (#277) Fixes #276 It's redundant. --- Gemfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Gemfile b/Gemfile index d323e2c5..a680c133 100644 --- a/Gemfile +++ b/Gemfile @@ -6,7 +6,6 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } gemspec group :development do - gem "bundler" # This is for suppressing the following warning: # # warning: ostruct was loaded from the standard library, but will From c87bda8bb8773da7e5a0faf9f16ff165eb052a35 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 16 Jul 2025 08:39:31 +0700 Subject: [PATCH 42/45] Remove ostruct from dev deps (#281) Fixes #280 It seems that it's no longer needed. 
--- Gemfile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Gemfile b/Gemfile index a680c133..22520c65 100644 --- a/Gemfile +++ b/Gemfile @@ -6,14 +6,6 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } gemspec group :development do - # This is for suppressing the following warning: - # - # warning: ostruct was loaded from the standard library, but will - # no longer be part of the default gems starting from Ruby 3.5.0. - # - # This should be part of "json". We can remove this when "json" - # depends on "ostruct" explicitly. - gem "ostruct" gem "rake" gem "rdoc" end From 1d876e3bf658b7b4ec7c3372867521695e8eb023 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 19 Aug 2025 09:38:38 +0900 Subject: [PATCH 43/45] Bump actions/checkout from 4 to 5 (#283) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/checkout](https://github.com/actions/checkout) from 4 to 5.
Release notes

Sourced from actions/checkout's releases.

v5.0.0

What's Changed

⚠️ Minimum Compatible Runner Version

v2.327.1
Release Notes

Make sure your runner is updated to this version or newer to use this release.

Full Changelog: https://github.com/actions/checkout/compare/v4...v5.0.0

v4.3.0

What's Changed

New Contributors

Full Changelog: https://github.com/actions/checkout/compare/v4...v4.3.0

v4.2.2

What's Changed

Full Changelog: https://github.com/actions/checkout/compare/v4.2.1...v4.2.2

v4.2.1

What's Changed

New Contributors

Full Changelog: https://github.com/actions/checkout/compare/v4.2.0...v4.2.1

... (truncated)

Changelog

Sourced from actions/checkout's changelog.

Changelog

V5.0.0

V4.3.0

v4.2.2

v4.2.1

v4.2.0

v4.1.7

v4.1.6

v4.1.5

v4.1.4

v4.1.3

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/checkout&package-manager=github_actions&previous-version=4&new-version=5)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/benchmark.yml | 2 +- .github/workflows/release.yml | 4 ++-- .github/workflows/test.yml | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2c638b03..651df879 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -16,7 +16,7 @@ jobs: - ubuntu-latest runs-on: ${{ matrix.runs-on }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 76269f44..f3dffca7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 10 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Extract release note run: | ruby \ @@ -37,7 +37,7 @@ jobs: id-token: write environment: release steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ruby diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0bd43457..31dc02a2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,7 +27,7 @@ jobs: # - runs-on: ubuntu-latest # ruby-version: truffleruby steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} @@ -39,7 +39,7 @@ jobs: name: frozen-string-literal runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ruby @@ -66,7 +66,7 @@ jobs: - windows-latest ruby-version: ${{ fromJson(needs.ruby-versions-gems.outputs.versions) }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: 
ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} @@ -95,7 +95,7 @@ jobs: name: "Document" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ruby @@ -105,7 +105,7 @@ jobs: - name: Build document run: | bundle exec rake warning:error rdoc - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 if: | github.event_name == 'push' with: From 5859bdeac792687eaf93d8e8f0b7e3c1e2ed5c23 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 23 Aug 2025 08:11:58 +0900 Subject: [PATCH 44/45] Added XML declaration check & `Source#skip_spaces` method (#282) ## Why? ### Added XML declaration check - The version attribute is required in XML declaration. - Only version attribute, encoding attribute, and standalone attribute are allowed in XML declaration. - XML declaration is only allowed once. See: https://www.w3.org/TR/xml/#NT-XMLDecl ### Added `Source#skip_spaces` method In the case of `@source.match?(/\s+/um, true)`, if there are no spaces at the beginning, I want to stop reading immediately. However, it continues to read the buffer until it finds a match, but it never finds a match. As a result, it continues reading until the end of the file. In the case of large XML files, `drop_parsed_content` calls occur frequently until the buffer is cleared, which may affect performance. 
## Benchmark ``` before after before(YJIT) after(YJIT) dom 32.534 35.130 54.559 53.528 i/s - 100.000 times in 3.073715s 2.846540s 1.832883s 1.868189s sax 44.785 44.089 78.303 77.842 i/s - 100.000 times in 2.232907s 2.268138s 1.277093s 1.284657s pull 51.750 51.105 90.819 90.658 i/s - 100.000 times in 1.932351s 1.956759s 1.101094s 1.103050s stream 51.427 51.444 89.820 88.971 i/s - 100.000 times in 1.944502s 1.943855s 1.113340s 1.123960s Comparison: dom before(YJIT): 54.6 i/s after(YJIT): 53.5 i/s - 1.02x slower after: 35.1 i/s - 1.55x slower before: 32.5 i/s - 1.68x slower sax before(YJIT): 78.3 i/s after(YJIT): 77.8 i/s - 1.01x slower before: 44.8 i/s - 1.75x slower after: 44.1 i/s - 1.78x slower pull before(YJIT): 90.8 i/s after(YJIT): 90.7 i/s - 1.00x slower before: 51.8 i/s - 1.75x slower after: 51.1 i/s - 1.78x slower stream before(YJIT): 89.8 i/s after(YJIT): 89.0 i/s - 1.01x slower after: 51.4 i/s - 1.75x slower before: 51.4 i/s - 1.75x slower ``` - YJIT=ON : 0.98x - 1.00x faster - YJIT=OFF : 0.98x - 1.07x faster --- lib/rexml/parsers/baseparser.rb | 156 +++++++++++++------ lib/rexml/source.rb | 7 +- test/parse/test_document_type_declaration.rb | 6 +- test/parse/test_processing_instruction.rb | 130 +++++++++++++++- test/test_xml_declaration.rb | 2 +- 5 files changed, 244 insertions(+), 57 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index a87657b5..9304e96d 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -144,6 +144,7 @@ module Private PEREFERENCE_PATTERN = /#{PEREFERENCE}/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um + EQUAL_PATTERN = /\s*=\s*/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um NAME_PATTERN = /#{NAME}/um GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" @@ -168,6 +169,7 @@ def initialize( source ) @entity_expansion_limit = Security.entity_expansion_limit @entity_expansion_text_limit = 
Security.entity_expansion_text_limit @source.ensure_buffer + @version = nil end def add_listener( listener ) @@ -280,7 +282,7 @@ def pull_event return [ :comment, process_comment ] elsif @source.match?("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" - unless @source.match?(/\s+/um, true) + unless @source.skip_spaces if @source.match?(">") message = "#{base_error_message}: name is missing" else @@ -290,7 +292,7 @@ def pull_event raise REXML::ParseException.new(message, @source) end name = parse_name(base_error_message) - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces if @source.match?("[", true) id = [nil, nil, nil] @document_status = :in_doctype @@ -306,7 +308,7 @@ def pull_event # For backward compatibility id[1], id[2] = id[2], nil end - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces if @source.match?("[", true) @document_status = :in_doctype elsif @source.match?(">", true) @@ -319,7 +321,7 @@ def pull_event end args = [:start_doctype, name, *id] if @document_status == :after_doctype - @source.match?(/\s*/um, true) + @source.skip_spaces @stack << [ :end_doctype ] end return args @@ -330,7 +332,7 @@ def pull_event end end if @document_status == :in_doctype - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces start_position = @source.position if @source.match?("") message = "#{base_error_message}: name is missing" else @@ -404,7 +406,7 @@ def pull_event id = parse_id(base_error_message, accept_external_id: true, accept_public_id: true) - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces unless @source.match?(">", true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) @@ -425,7 +427,7 @@ def pull_event end end if @document_status == :after_doctype - @source.match?(/\s*/um, true) + @source.skip_spaces end begin start_position = @source.position @@ -642,6 +644,10 @@ def need_source_encoding_update?(xml_declaration_encoding) true end + 
def normalize_xml_declaration_encoding(xml_declaration_encoding) + /\AUTF-16(?:BE|LE)\z/i.match?(xml_declaration_encoding) ? "UTF-16" : nil + end + def parse_name(base_error_message) md = @source.match(Private::NAME_PATTERN, true) unless md @@ -735,37 +741,85 @@ def process_comment def process_instruction name = parse_name("Malformed XML: Invalid processing instruction node") - if @source.match?(/\s+/um, true) - match_data = @source.match(/(.*?)\?>/um, true) - unless match_data - raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + if name == "xml" + xml_declaration + else # PITarget + if @source.skip_spaces # e.g. + start_position = @source.position + content = @source.read_until("?>") + unless content.chomp!("?>") + @source.position = start_position + raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source) + end + else # e.g. + content = nil + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source) + end end - content = match_data[1] - else - content = nil + [:processing_instruction, name, content] + end + end + + def xml_declaration + unless @version.nil? 
+ raise ParseException.new("Malformed XML: XML declaration is duplicated", @source) + end + if @document_status + raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) + end + unless @source.skip_spaces + raise ParseException.new("Malformed XML: XML declaration misses spaces before version", @source) + end + unless @source.match?("version", true) + raise ParseException.new("Malformed XML: XML declaration misses version", @source) + end + @version = parse_attribute_value_with_equal("xml") + unless @source.skip_spaces unless @source.match?("?>", true) - raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) end + encoding = normalize_xml_declaration_encoding(@source.encoding) + return [ :xmldecl, @version, encoding, nil ] # e.g. end - if name == "xml" - if @document_status - raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) - end - version = VERSION.match(content) - version = version[1] unless version.nil? - encoding = ENCODING.match(content) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding + + if @source.match?("encoding", true) + encoding = parse_attribute_value_with_equal("xml") + unless @source.skip_spaces + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) + end + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + encoding ||= normalize_xml_declaration_encoding(@source.encoding) + return [ :xmldecl, @version, encoding, nil ] # e.g. end - if encoding.nil? 
and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" + end + + if @source.match?("standalone", true) + standalone = parse_attribute_value_with_equal("xml") + case standalone + when "yes", "no" + else + raise ParseException.new("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>", @source) end - standalone = STANDALONE.match(content) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] end - [:processing_instruction, name, content] + @source.skip_spaces + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) + end + + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + encoding ||= normalize_xml_declaration_encoding(@source.encoding) + + # e.g. + # + # + # + [ :xmldecl, @version, encoding, standalone ] end if StringScanner::Version < "3.1.1" @@ -787,6 +841,25 @@ def scan_quote end end + def parse_attribute_value_with_equal(name) + unless @source.match?(Private::EQUAL_PATTERN, true) + message = "Missing attribute equal: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + unless quote = scan_quote + message = "Missing attribute value start quote: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + start_position = @source.position + value = @source.read_until(quote) + unless value.chomp!(quote) + @source.position = start_position + message = "Missing attribute value end quote: <#{name}>: <#{quote}>" + raise REXML::ParseException.new(message, @source) + end + value + end + def parse_attributes(prefixes) attributes = {} expanded_names = {} @@ -801,23 +874,8 @@ def parse_attributes(prefixes) name = match[1] prefix = match[2] local_part = match[3] - - unless @source.match?(/\s*=\s*/um, true) - message = "Missing attribute equal: <#{name}>" - raise REXML::ParseException.new(message, @source) - end - unless quote = scan_quote - message = "Missing attribute 
value start quote: <#{name}>" - raise REXML::ParseException.new(message, @source) - end - start_position = @source.position - value = @source.read_until(quote) - unless value.chomp!(quote) - @source.position = start_position - message = "Missing attribute value end quote: <#{name}>: <#{quote}>" - raise REXML::ParseException.new(message, @source) - end - @source.match?(/\s*/um, true) + value = parse_attribute_value_with_equal(name) + @source.skip_spaces if prefix == "xmlns" if local_part == "xml" if value != Private::XML_PREFIXED_NAMESPACE diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 3ec1141e..99500072 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -65,9 +65,10 @@ class Source attr_reader :encoding module Private + SPACES_PATTERN = /\s+/um SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} - pre_defined_terms = ["'", '"', "<", "]]>"] + pre_defined_terms = ["'", '"', "<", "]]>", "?>"] if StringScanner::Version < "3.1.1" pre_defined_terms.each do |term| PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ @@ -150,6 +151,10 @@ def match?(pattern, cons=false) end end + def skip_spaces + @scanner.skip(Private::SPACES_PATTERN) ? 
true : false + end + def position @scanner.pos end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index b22863a9..d4658b9e 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -49,10 +49,10 @@ def test_no_name end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed DOCTYPE: name is missing -Line: 3 -Position: 17 +Line: 1 +Position: 10 Last 80 unconsumed characters: - + DETAIL end end diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index ba381dc4..70d17747 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -30,7 +30,7 @@ def test_unclosed_content parse(" Line: 1 Position: 14 Last 80 unconsumed characters: @@ -43,7 +43,7 @@ def test_unclosed_no_content parse(" Line: 1 Position: 6 Last 80 unconsumed characters: @@ -51,6 +51,19 @@ def test_unclosed_no_content DETAIL end + def test_xml_declaration_duplicated + exception = assert_raise(REXML::ParseException) do + parse('') + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed XML: XML declaration is duplicated +Line: 1 +Position: 42 +Last 80 unconsumed characters: + version="1.0"?> + DETAIL + end + def test_xml_declaration_not_at_document_start exception = assert_raise(REXML::ParseException) do parser = REXML::Parsers::BaseParser.new('') @@ -64,7 +77,118 @@ def test_xml_declaration_not_at_document_start Line: 1 Position: 25 Last 80 unconsumed characters: + version="1.0" ?> + DETAIL + end + + def test_xml_declaration_missing_spaces + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? 
+ parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration misses spaces before version + Line: 1 + Position: 7 + Last 80 unconsumed characters: + ?> + DETAIL + end + + def test_xml_declaration_missing_version + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration misses version + Line: 1 + Position: 8 + Last 80 unconsumed characters: + ?> + DETAIL + end + + def test_xml_declaration_unclosed_content + exception = assert_raise(REXML::ParseException) do + parse('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 37 + Last 80 unconsumed characters: + encoding="UTF-8"?> + DETAIL + end + + def test_xml_declaration_unclosed_content_missing_space_after_encoding + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 53 + Last 80 unconsumed characters: + standalone="no"?> + DETAIL + end + + def test_xml_declaration_unclosed_content_with_unknown_attributes + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? 
+ parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 31 + Last 80 unconsumed characters: + test="no"?> + DETAIL + end + + def test_xml_declaration_standalone_no_yes_or_no + exception = assert_raise(REXML::ParseException) do + parse('') + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed XML: XML declaration standalone is not yes or no : +Line: 1 +Position: 38 +Last 80 unconsumed characters: +?> DETAIL end end @@ -113,7 +237,7 @@ def test_content_question def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new("" * n + " ?>") + REXML::Document.new("" * n + " ?>") end end diff --git a/test/test_xml_declaration.rb b/test/test_xml_declaration.rb index 6a1f4df0..4503a90e 100644 --- a/test/test_xml_declaration.rb +++ b/test/test_xml_declaration.rb @@ -7,7 +7,7 @@ module REXMLTests class TestXmlDeclaration < Test::Unit::TestCase def setup xml = <<~XML - + XML From f36916fe1c66b8cdc1fe482263115625e084d8fe Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 26 Aug 2025 14:29:47 +0900 Subject: [PATCH 45/45] Add 3.4.2 entry (#284) --- NEWS.md | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/NEWS.md b/NEWS.md index 7f95d829..313b07d5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,107 @@ # News +## 3.4.2 - 2025-08-26 {#version-3-4-2} + +### Improvement + + * Improved performance. 
+ * GH-244 + * GH-245 + * GH-246 + * GH-249 + * GH-256 + * Patch by NAITOH Jun + + * Raise appropriate exception when failing to match start tag in DOCTYPE + * GH-247 + * Patch by NAITOH Jun + + * Deprecate accepting array as an element in XPath.match, first and each + * GH-252 + * Patch by tomoya ishida + + * Don't call needless encoding_updated + * GH-259 + * Patch by Sutou Kouhei + + * Reuse XPath::match + * GH-263 + * Patch by pboling + + * Cache redundant calls for doctype + * GH-264 + * Patch by pboling + + * Use Safe Navigation (&.) from Ruby 2.3 + * GH-265 + * Patch by pboling + + * Remove redundant return statements + * GH-266 + * Patch by pboling + + * Added XML declaration check & Source#skip_spaces method + * GH-282 + * Patch by NAITOH Jun + * Reported by Sofi Aberegg + +### Fixes + + * Fix docs typo + * GH-248 + * Patch by James Coleman + + * Fix reverse sort in xpath_parser + * GH-251 + * Patch by tomoya ishida + + * Fix duplicate responses in XPath following, following-sibling, preceding, preceding-sibling + * GH-255 + * Patch by NAITOH Jun + + * Fix wrong Encoding resolution + * GH-258 + * Patch by Sutou Kouhei + + * Handle nil when parsing fragment + * GH-267 + * GH-268 + * Patch by pboling + + * [Documentation] Use # to reference instance methods + * GH-269 + * GH-270 + * Patch by pboling + + * Fix & Deprecate REXML::Text#text_indent + * GH-273 + * GH-275 + * Patch by pboling + + * remove bundler from dev deps + * GH-276 + * GH-277 + * Patch by pboling + + * remove ostruct from dev deps + * GH-280 + * GH-281 + * Patch by pboling + +### Thanks + + * NAITOH Jun + + * tomoya ishida + + * James Coleman + + * pboling + + * Sutou Kouhei + + * Sofi Aberegg + ## 3.4.1 - 2025-02-16 {#version-3-4-1} ### Improvement