From a1d875b23340df6b33d3bbe6b17cca807eb0e3d2 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 15 Dec 2024 11:19:55 +0900 Subject: [PATCH 01/34] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 42623b08..a653f028 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.4.0" + VERSION = "3.4.1" REVISION = "" Copyright = COPYRIGHT From bb0bedd25dbb69b247b0894a6c357f8903a2b9a2 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 19 Dec 2024 11:18:52 +0900 Subject: [PATCH 02/34] Optimize `IOSource#read_until` method by using `StringScanner#check_until(string)` (#226) ## Why? `StringScanner#check_until(string)` is faster than `StringScanner#check_until(regex)`. See: - https://github.com/ruby/strscan/pull/106 - https://github.com/ruby/strscan/pull/111 ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 19.459 19.840 35.035 35.786 i/s - 100.000 times in 5.139034s 5.040369s 2.854304s 2.794367s sax 30.057 30.026 52.986 53.716 i/s - 100.000 times in 3.326998s 3.330499s 1.887303s 1.861652s pull 33.777 34.415 62.294 64.020 i/s - 100.000 times in 2.960622s 2.905668s 1.605284s 1.562002s stream 33.789 34.003 60.174 60.411 i/s - 100.000 times in 2.959521s 2.940916s 1.661845s 1.655334s Comparison: dom after(YJIT): 35.8 i/s before(YJIT): 35.0 i/s - 1.02x slower after: 19.8 i/s - 1.80x slower before: 19.5 i/s - 1.84x slower sax after(YJIT): 53.7 i/s before(YJIT): 53.0 i/s - 1.01x slower before: 30.1 i/s - 1.79x slower after: 30.0 i/s - 1.79x slower pull after(YJIT): 64.0 i/s before(YJIT): 62.3 
i/s - 1.03x slower after: 34.4 i/s - 1.86x slower before: 33.8 i/s - 1.90x slower stream after(YJIT): 60.4 i/s before(YJIT): 60.2 i/s - 1.00x slower after: 34.0 i/s - 1.78x slower before: 33.8 i/s - 1.79x slower ``` - YJIT=ON : 1.00x - 1.03x faster - YJIT=OFF : 1.00x - 1.02x faster --- lib/rexml/source.rb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index b0b89b71..2409f76e 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -68,8 +68,14 @@ module Private SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} pre_defined_terms = ["'", '"', "<"] - pre_defined_terms.each do |term| - PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ + if StringScanner::Version < "3.1.1" + pre_defined_terms.each do |term| + PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ + end + else + pre_defined_terms.each do |term| + PRE_DEFINED_TERM_PATTERNS[term] = term + end end end private_constant :Private From b70388c2638d90ebd2ae471bd85239d8469b8e62 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 21 Dec 2024 07:59:47 +0900 Subject: [PATCH 03/34] Use `StringScanner#peek_byte` to get double or single quotation mark (#227) ## Why? `StringScanner#peek_byte` is fast, because it does not generate String object. 
## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 19.753 19.888 35.641 35.928 i/s - 100.000 times in 5.062402s 5.028121s 2.805792s 2.783339s sax 30.349 30.978 53.485 57.885 i/s - 100.000 times in 3.295012s 3.228103s 1.869671s 1.727567s pull 34.170 35.436 61.713 66.534 i/s - 100.000 times in 2.926534s 2.821955s 1.620404s 1.502996s stream 33.121 35.268 60.751 63.276 i/s - 100.000 times in 3.019222s 2.835443s 1.646065s 1.580374s Comparison: dom after(YJIT): 35.9 i/s before(YJIT): 35.6 i/s - 1.01x slower after: 19.9 i/s - 1.81x slower before: 19.8 i/s - 1.82x slower sax after(YJIT): 57.9 i/s before(YJIT): 53.5 i/s - 1.08x slower after: 31.0 i/s - 1.87x slower before: 30.3 i/s - 1.91x slower pull after(YJIT): 66.5 i/s before(YJIT): 61.7 i/s - 1.08x slower after: 35.4 i/s - 1.88x slower before: 34.2 i/s - 1.95x slower stream after(YJIT): 63.3 i/s before(YJIT): 60.8 i/s - 1.04x slower after: 35.3 i/s - 1.79x slower before: 33.1 i/s - 1.91x slower ``` - YJIT=ON : 1.01x - 1.08x faster - YJIT=OFF : 1.00x - 1.06x faster Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 22 ++++++++++++++++++++-- lib/rexml/source.rb | 8 ++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 90851bb1..13cdd821 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -766,6 +766,25 @@ def process_instruction [:processing_instruction, name, content] end + if StringScanner::Version < "3.1.1" + def scan_quote + @source.match(/(['"])/, true)&.[](1) + end + else + def scan_quote + case @source.peek_byte + when 34 # '"'.ord + @source.scan_byte + '"' + when 39 # "'".ord + @source.scan_byte + 
"'" + else + nil + end + end + end + def parse_attributes(prefixes) attributes = {} expanded_names = {} @@ -785,11 +804,10 @@ def parse_attributes(prefixes) message = "Missing attribute equal: <#{name}>" raise REXML::ParseException.new(message, @source) end - unless match = @source.match(/(['"])/, true) + unless quote = scan_quote message = "Missing attribute value start quote: <#{name}>" raise REXML::ParseException.new(message, @source) end - quote = match[1] start_position = @source.position value = @source.read_until(quote) unless value.chomp!(quote) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 2409f76e..5ba5ab12 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -158,6 +158,14 @@ def position=(pos) @scanner.pos = pos end + def peek_byte + @scanner.peek_byte + end + + def scan_byte + @scanner.scan_byte + end + # @return true if the Source is exhausted def empty? @scanner.eos? From a4bf93a65e03c6bf26c688a8a616ad135f89244f Mon Sep 17 00:00:00 2001 From: OlofKalufs Date: Mon, 20 Jan 2025 15:38:08 +0100 Subject: [PATCH 04/34] Added rdoc as a development dependency (for Ruby 3.5+) (#235) Ruby 3.5+ requires that rdoc explicitly be declared as a dependency Should sort out GitHub Actions that are failing due to this --- Gemfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Gemfile b/Gemfile index 1710ec99..d323e2c5 100644 --- a/Gemfile +++ b/Gemfile @@ -16,6 +16,7 @@ group :development do # depends on "ostruct" explicitly. 
gem "ostruct" gem "rake" + gem "rdoc" end group :benchmark do From 107e273337b2e2160d6b0b15e10d0a9da0b9e164 Mon Sep 17 00:00:00 2001 From: OlofKalufs Date: Mon, 20 Jan 2025 23:13:00 +0100 Subject: [PATCH 05/34] Fix serialization of ATTLIST is incorrect (#234) GitHub: fix #233 Changed so that " EOL + assert_equal '', doc.doctype.children[0].to_s.gsub(/\s+/, " ") assert_equal 'gobble', doc.root.attributes['bar'] assert_equal 'xxx', doc.root.elements[2].namespace assert_equal 'two', doc.root.elements[1].namespace From f63c510287d29c2d6261ad94a641cb93f731be4a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Fri, 24 Jan 2025 09:55:57 +0900 Subject: [PATCH 06/34] Changed benchmark target to Ruby latest (#236) Ruby 3.4 has been released, we will change our benchmark target to Ruby latest(3.4). Co-authored-by: Sutou Kouhei --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 52349b44..2c638b03 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -11,7 +11,7 @@ jobs: fail-fast: false matrix: ruby-version: - - '3.3' + - 'ruby' runs-on: - ubuntu-latest runs-on: ${{ matrix.runs-on }} From 67d21be36c87d23b7a00c4f50017d9db977319d2 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 26 Jan 2025 19:56:59 +0900 Subject: [PATCH 07/34] Reduced regular expression processing in the form of processing white space first (#237) ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.4.1/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +PRISM [arm64-darwin24] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 19.849 20.109 36.064 38.655 i/s - 100.000 times in 5.038102s 4.972864s 2.772838s 2.586981s sax 30.339 30.449 52.946 54.873 i/s - 100.000 times in 3.296102s 3.284176s 1.888722s 
1.822391s pull 34.785 34.916 65.808 65.219 i/s - 100.000 times in 2.874810s 2.863976s 1.519581s 1.533305s stream 34.766 34.921 61.920 63.277 i/s - 100.000 times in 2.876359s 2.863571s 1.615000s 1.580354s Comparison: dom after(YJIT): 38.7 i/s before(YJIT): 36.1 i/s - 1.07x slower after: 20.1 i/s - 1.92x slower before: 19.8 i/s - 1.95x slower sax after(YJIT): 54.9 i/s before(YJIT): 52.9 i/s - 1.04x slower after: 30.4 i/s - 1.80x slower before: 30.3 i/s - 1.81x slower pull before(YJIT): 65.8 i/s after(YJIT): 65.2 i/s - 1.01x slower after: 34.9 i/s - 1.88x slower before: 34.8 i/s - 1.89x slower stream after(YJIT): 63.3 i/s before(YJIT): 61.9 i/s - 1.02x slower after: 34.9 i/s - 1.81x slower before: 34.8 i/s - 1.82x slower ``` - YJIT=ON : 0.99x - 1.07x faster - YJIT=OFF : 1.00x - 1.01x faster --- lib/rexml/parsers/baseparser.rb | 13 ++++++++----- test/parse/test_document_type_declaration.rb | 10 +++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 87f50f09..44aacfa2 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -297,10 +297,11 @@ def pull_event raise REXML::ParseException.new(message, @source) end name = parse_name(base_error_message) - if @source.match?(/\s*\[/um, true) + @source.match?(/\s*/um, true) # skip spaces + if @source.match?("[", true) id = [nil, nil, nil] @document_status = :in_doctype - elsif @source.match?(/\s*>/um, true) + elsif @source.match?(">", true) id = [nil, nil, nil] @document_status = :after_doctype @source.ensure_buffer @@ -312,9 +313,10 @@ def pull_event # For backward compatibility id[1], id[2] = id[2], nil end - if @source.match?(/\s*\[/um, true) + @source.match?(/\s*/um, true) # skip spaces + if @source.match?("[", true) @document_status = :in_doctype - elsif @source.match?(/\s*>/um, true) + elsif @source.match?(">", true) @document_status = :after_doctype @source.ensure_buffer else @@ -409,7 +411,8 @@ def 
pull_event id = parse_id(base_error_message, accept_external_id: true, accept_public_id: true) - unless @source.match?(/\s*>/um, true) + @source.match?(/\s*/um, true) # skip spaces + unless @source.match?(">", true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 99c23745..b22863a9 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -153,7 +153,7 @@ def test_no_literal Line: 3 Position: 26 Last 80 unconsumed characters: - SYSTEM> +SYSTEM> DETAIL end @@ -200,7 +200,7 @@ def test_content_double_quote Line: 3 Position: 62 Last 80 unconsumed characters: - PUBLIC 'double quote " is invalid' "r.dtd"> +PUBLIC 'double quote " is invalid' "r.dtd"> DETAIL end @@ -228,10 +228,10 @@ def test_garbage_after_literal end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed DOCTYPE: garbage after external ID -Line: 3 -Position: 65 +Line: 1 +Position: 58 Last 80 unconsumed characters: -x'> +x'> DETAIL end From bfb37e9ca4cb974c9bb2dc2f06e1202719d1bc4d Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 16 Feb 2025 10:57:37 +0900 Subject: [PATCH 08/34] Add 3.4.1 entry (#239) --- NEWS.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/NEWS.md b/NEWS.md index f25a33f2..51a45cab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,29 @@ # News +## 3.4.1 - 2025-02-16 {#version-3-4-1} + +### Improvement + + * Improved performance. 
+ * GH-226 + * GH-227 + * GH-237 + * Patch by NAITOH Jun + +### Fixes + + * Fix serialization of ATTLIST is incorrect + * GH-233 + * GH-234 + * Patch by OlofKalufs + * Reported by OlofKalufs + +### Thanks + + * NAITOH Jun + + * OlofKalufs + ## 3.4.0 - 2024-12-15 {#version-3-4-0} ### Improvement From b97e454ceb2e1719a487bfebaae3da4a706a854b Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 16 Feb 2025 16:48:06 +0900 Subject: [PATCH 09/34] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index a653f028..bf3c0d32 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.4.1" + VERSION = "3.4.2" REVISION = "" Copyright = COPYRIGHT From 64a709e74551d5968f2241a772876f4b0c8dea22 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 2 Mar 2025 11:38:54 +0900 Subject: [PATCH 10/34] Improve CDATA parse performance (#244) ## Why? 
GitHub: fix #243 ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/parse_cdata.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) dom 648.361 1.178k 591.590 1.046k i/s - 100.000 times in 0.154235s 0.084913s 0.169036s 0.095627s sax 699.061 1.378k 651.148 1.196k i/s - 100.000 times in 0.143049s 0.072549s 0.153575s 0.083611s pull 699.271 1.379k 660.275 1.210k i/s - 100.000 times in 0.143006s 0.072527s 0.151452s 0.082622s stream 701.725 1.383k 659.483 1.228k i/s - 100.000 times in 0.142506s 0.072307s 0.151634s 0.081455s Comparison: dom master: 1177.7 i/s master(YJIT): 1045.7 i/s - 1.13x slower rexml 3.4.1: 648.4 i/s - 1.82x slower 3.4.1(YJIT): 591.6 i/s - 1.99x slower sax master: 1378.4 i/s master(YJIT): 1196.0 i/s - 1.15x slower rexml 3.4.1: 699.1 i/s - 1.97x slower 3.4.1(YJIT): 651.1 i/s - 2.12x slower pull master: 1378.8 i/s master(YJIT): 1210.3 i/s - 1.14x slower rexml 3.4.1: 699.3 i/s - 1.97x slower 3.4.1(YJIT): 660.3 i/s - 2.09x slower stream master: 1383.0 i/s master(YJIT): 1227.7 i/s - 1.13x slower rexml 3.4.1: 701.7 i/s - 1.97x slower 3.4.1(YJIT): 659.5 i/s - 2.10x slower ``` - YJIT=ON : 1.76x - 1.83x faster - YJIT=OFF : 1.82x - 1.97x faster Reported by Masamune. Thanks!!! 
Co-authored-by: Sutou Kouhei --- benchmark/parse_cdata.yaml | 50 +++++++++++++++++++++++++++++++++ lib/rexml/parsers/baseparser.rb | 10 +++++-- lib/rexml/source.rb | 2 +- test/parse/test_cdata.rb | 20 ++++++++++++- 4 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 benchmark/parse_cdata.yaml diff --git a/benchmark/parse_cdata.yaml b/benchmark/parse_cdata.yaml new file mode 100644 index 00000000..cde04306 --- /dev/null +++ b/benchmark/parse_cdata.yaml @@ -0,0 +1,50 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + require 'rexml/parsers/sax2parser' + require 'rexml/parsers/pullparser' + require 'rexml/parsers/streamparser' + require 'rexml/streamlistener' + + def build_xml(size) + xml = "\n" + + "Test\n" + + "\n" + end + xml = build_xml(100000) + + class Listener + include REXML::StreamListener + end + +benchmark: + 'dom' : REXML::Document.new(xml) + 'sax' : REXML::Parsers::SAX2Parser.new(xml).parse + 'pull' : | + parser = REXML::Parsers::PullParser.new(xml) + while parser.has_next? 
+ parser.pull + end + 'stream' : REXML::Parsers::StreamParser.new(xml, Listener.new).parse diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 44aacfa2..e666c2af 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -471,9 +471,13 @@ def pull_event end return [ :comment, md[1] ] - else - md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) - return [ :cdata, md[1] ] if md + elsif @source.match?("[CDATA[", true) + text = @source.read_until("]]>") + if text.chomp!("]]>") + return [ :cdata, text ] + else + raise REXML::ParseException.new("Malformed CDATA: Missing end ']]>'", @source) + end end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 5ba5ab12..3ec1141e 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -67,7 +67,7 @@ class Source module Private SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} - pre_defined_terms = ["'", '"', "<"] + pre_defined_terms = ["'", '"', "<", "]]>"] if StringScanner::Version < "3.1.1" pre_defined_terms.each do |term| PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb index b5f1a3bc..c742d6a1 100644 --- a/test/parse/test_cdata.rb +++ b/test/parse/test_cdata.rb @@ -7,10 +7,28 @@ module REXMLTests class TestParseCData < Test::Unit::TestCase include Test::Unit::CoreAssertions + def parse(xml) + REXML::Document.new(xml) + end + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('" * n + ' ]]>') + parse('" * n + ' ]]>') + end + end + + class TestInvalid < self + def test_unclosed_cdata + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed CDATA: Missing end ']]>' + Line: 1 + Position: 25 + Last 80 
unconsumed characters: + DETAIL end end end From 434909171ef3756c1ca2b84f5c90923a72c6a591 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 3 Mar 2025 13:47:31 +0900 Subject: [PATCH 11/34] Improve comment parse performance (#245) ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/parse_comment.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) top_level 999.440 5.058k 922.416 3.340k i/s - 100.000 times in 0.100056s 0.019770s 0.108411s 0.029936s in_doctype 1.063k 4.890k 980.498 3.341k i/s - 100.000 times in 0.094116s 0.020449s 0.101989s 0.029927s after_doctype 638.321 1.304k 603.952 1.153k i/s - 100.000 times in 0.156661s 0.076710s 0.165576s 0.086748s Comparison: top_level master: 5058.2 i/s master(YJIT): 3340.5 i/s - 1.51x slower rexml 3.4.1: 999.4 i/s - 5.06x slower 3.4.1(YJIT): 922.4 i/s - 5.48x slower in_doctype master: 4890.2 i/s master(YJIT): 3341.5 i/s - 1.46x slower rexml 3.4.1: 1062.5 i/s - 4.60x slower 3.4.1(YJIT): 980.5 i/s - 4.99x slower after_doctype master: 1303.6 i/s master(YJIT): 1152.8 i/s - 1.13x slower rexml 3.4.1: 638.3 i/s - 2.04x slower 3.4.1(YJIT): 604.0 i/s - 2.16x slower ``` - YJIT=ON : 1.90x - 3.62x faster - YJIT=OFF : 2.04x - 5.06x faster --- benchmark/parse_comment.yaml | 36 ++++++++++++++++++++++++++++++ lib/rexml/parsers/baseparser.rb | 39 ++++++++++++++------------------- test/parse/test_comment.rb | 21 +++++++++++++----- 3 files changed, 69 insertions(+), 27 deletions(-) create mode 100644 benchmark/parse_comment.yaml diff --git a/benchmark/parse_comment.yaml b/benchmark/parse_comment.yaml new file mode 100644 index 00000000..a0a3a771 --- /dev/null +++ b/benchmark/parse_comment.yaml @@ -0,0 +1,36 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | 
+ require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + SIZE = 100000 + + top_level_xml = "\n" + in_doctype_xml = "]>" + after_doctype_xml = "" + +benchmark: + 'top_level' : REXML::Document.new(top_level_xml) + 'in_doctype' : REXML::Document.new(in_doctype_xml) + 'after_doctype' : REXML::Document.new(after_doctype_xml) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index e666c2af..61d38ae2 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -277,14 +277,7 @@ def pull_event return process_instruction elsif @source.match?("/um, true) - if md.nil? - raise REXML::ParseException.new("Unclosed comment", @source) - end - if /--|-\z/.match?(md[1]) - raise REXML::ParseException.new("Malformed comment", @source) - end - return [ :comment, md[1] ] + return [ :comment, process_comment ] elsif @source.match?("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" unless @source.match?(/\s+/um, true) @@ -417,12 +410,8 @@ def pull_event raise REXML::ParseException.new(message, @source) end return [:notationdecl, name, *id] - elsif md = @source.match(/--(.*?)-->/um, true) - case md[1] - when /--/, /-\z/ - raise REXML::ParseException.new("Malformed comment", @source) - end - return [ :comment, md[1] ] if md + elsif @source.match?("--", true) + return [ :comment, process_comment ] end elsif match = @source.match(/(%.*?;)\s*/um, true) return [ :externalentity, match[1] ] @@ -463,14 +452,8 @@ def pull_event md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][0] == ?- - md = @source.match(/--(.*?)-->/um, true) - - if md.nil? 
|| /--|-\z/.match?(md[1]) - raise REXML::ParseException.new("Malformed comment", @source) - end - - return [ :comment, md[1] ] + if @source.match?("--", true) + return [ :comment, process_comment ] elsif @source.match?("[CDATA[", true) text = @source.read_until("]]>") if text.chomp!("]]>") @@ -738,6 +721,18 @@ def parse_id_invalid_details(accept_external_id:, end end + def process_comment + text = @source.read_until("-->") + unless text.chomp!("-->") + raise REXML::ParseException.new("Unclosed comment: Missing end '-->'", @source) + end + + if text.include? "--" or text.end_with?("-") + raise REXML::ParseException.new("Malformed comment", @source) + end + text + end + def process_instruction name = parse_name("Malformed XML: Invalid processing instruction node") if @source.match?(/\s+/um, true) diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 4475dca7..c573e711 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -17,7 +17,7 @@ def test_toplevel_unclosed_comment parse("' Line: 1 Position: 4 Last 80 unconsumed characters: @@ -48,6 +48,18 @@ def test_toplevel_malformed_comment_end DETAIL end + def test_doctype_unclosed_comment + exception = assert_raise(REXML::ParseException) do + parse("' + Line: 1 + Position: 19 + Last 80 unconsumed characters: + DETAIL + end + def test_doctype_malformed_comment_inner exception = assert_raise(REXML::ParseException) do parse("") @@ -72,16 +84,15 @@ def test_doctype_malformed_comment_end DETAIL end - def test_after_doctype_malformed_comment_short + def test_after_doctype_unclosed_comment exception = assert_raise(REXML::ParseException) do parse("") end - assert_equal(<<~DETAIL.chomp, exception.to_s) - Malformed comment + assert_equal(<<~DETAIL, exception.to_s) + Unclosed comment: Missing end '-->' Line: 1 Position: 8 Last 80 unconsumed characters: - --> DETAIL end From a5f31c49be106011c4d96cb0e308ebbba118d192 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 5 Mar 2025 06:20:42 +0900 
Subject: [PATCH 12/34] Improve CDATA and comment parse performance (#246) ## Why? Since `` are malformed node, they do not need to be checked before comments and CDATA. ## Benchmark : comment (after_doctype) ``` $ benchmark-driver benchmark/parse_comment.yaml Calculating ------------------------------------- before after before(YJIT) after(YJIT) after_doctype 1.306k 5.586k 1.152k 3.569k i/s - 100.000 times in 0.076563s 0.017903s 0.086822s 0.028020s Comparison: after_doctype after: 5585.7 i/s after(YJIT): 3568.9 i/s - 1.57x slower before: 1306.1 i/s - 4.28x slower before(YJIT): 1151.8 i/s - 4.85x slower ``` - YJIT=ON : 3.09x faster - YJIT=OFF : 4.28x faster ## Benchmark : CDATA ``` $ benchmark-driver benchmark/parse_cdata.yaml Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 1.269k 5.548k 1.053k 3.072k i/s - 100.000 times in 0.078808s 0.018026s 0.094976s 0.032553s sax 1.399k 8.244k 1.220k 4.460k i/s - 100.000 times in 0.071458s 0.012130s 0.081958s 0.022422s pull 1.411k 8.319k 1.260k 4.806k i/s - 100.000 times in 0.070883s 0.012021s 0.079335s 0.020809s stream 1.420k 8.320k 1.254k 4.728k i/s - 100.000 times in 0.070406s 0.012019s 0.079738s 0.021149s Comparison: dom after: 5547.5 i/s after(YJIT): 3071.9 i/s - 1.81x slower before: 1268.9 i/s - 4.37x slower before(YJIT): 1052.9 i/s - 5.27x slower sax after: 8244.0 i/s after(YJIT): 4459.9 i/s - 1.85x slower before: 1399.4 i/s - 5.89x slower before(YJIT): 1220.1 i/s - 6.76x slower pull after: 8318.8 i/s after(YJIT): 4805.6 i/s - 1.73x slower before: 1410.8 i/s - 5.90x slower before(YJIT): 1260.5 i/s - 6.60x slower stream after: 8320.2 i/s after(YJIT): 4728.4 i/s - 1.76x slower before: 1420.3 i/s - 5.86x slower before(YJIT): 1254.1 i/s - 6.63x slower ``` - YJIT=ON : 2.91x - 3.80x faster - YJIT=OFF : 4.37x - 5.90x faster Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 6 ++---- test/parse/test_comment.rb | 13 +++++++++++++ 2 files changed, 15 insertions(+), 4 
deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 61d38ae2..de85aebd 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -449,9 +449,7 @@ def pull_event end return [ :end_element, last_tag ] elsif @source.match?("!", true) - md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" - raise REXML::ParseException.new("Malformed node", @source) unless md if @source.match?("--", true) return [ :comment, process_comment ] elsif @source.match?("[CDATA[", true) @@ -461,9 +459,9 @@ def pull_event else raise REXML::ParseException.new("Malformed CDATA: Missing end ']]>'", @source) end + else + raise REXML::ParseException.new("Malformed node: Started with '") From a85203e88c8f50f64140fb50492cf9dbe3d79301 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 5 Mar 2025 09:45:19 +0900 Subject: [PATCH 13/34] Raise appropriate exception when failing to match start tag in DOCTYPE (#247) ## Why? Added exception to make the process easier to understand. 
--- lib/rexml/parsers/baseparser.rb | 5 +++-- test/parse/test_comment.rb | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index de85aebd..750b1697 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -412,14 +412,15 @@ def pull_event return [:notationdecl, name, *id] elsif @source.match?("--", true) return [ :comment, process_comment ] + else + raise REXML::ParseException.new("Malformed node: Started with '/um, true) @document_status = :after_doctype return [ :end_doctype ] - end - if @document_status == :in_doctype + else raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source) end end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 5349c18e..6339835d 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -48,6 +48,19 @@ def test_toplevel_malformed_comment_end DETAIL end + def test_doctype_malformed_node + exception = assert_raise(REXML::ParseException) do + parse(" Date: Thu, 3 Apr 2025 03:45:35 -0400 Subject: [PATCH 14/34] Fix docs typo in code example (#248) --- lib/rexml/document.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index d1747dd4..1960012c 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -309,8 +309,8 @@ def stand_alone? end # :call-seq: - # doc.write(output=$stdout, indent=-1, transtive=false, ie_hack=false, encoding=nil) - # doc.write(options={:output => $stdout, :indent => -1, :transtive => false, :ie_hack => false, :encoding => nil}) + # doc.write(output=$stdout, indent=-1, transitive=false, ie_hack=false, encoding=nil) + # doc.write(options={:output => $stdout, :indent => -1, :transitive => false, :ie_hack => false, :encoding => nil}) # # Write the XML tree out, optionally with indent. 
This writes out the # entire XML document, including XML declarations, doctype declarations, From d944fa478a972febe9c3ad2cf35232223d391597 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 3 May 2025 09:03:12 +0900 Subject: [PATCH 15/34] NEWS.md : Fix the mentioned of the PR in CVE-2024-35176. (#253) I think the mention of CVE-2024-35176 in NEWS.md is incorrect. ``` - Improved parse performance when an attribute has many ' characters. --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 51a45cab..7f95d829 100644 --- a/NEWS.md +++ b/NEWS.md @@ -386,7 +386,7 @@ * Patch by NAITOH Jun. - * Improved parse performance when an attribute has many `<`s. + * Improved parse performance when an attribute has many `>`s. * GH-126 From de6f40ed8749dd6ab4b7c4b80494a824f7f9027a Mon Sep 17 00:00:00 2001 From: tomoya ishida Date: Sat, 3 May 2025 09:21:27 +0900 Subject: [PATCH 16/34] Fix reverse sort in xpath_parser (#251) The code below was failing with `REXML::XPathParser#sort': undefined method '-@' for an instance of Array` ```ruby d = REXML::Document.new("") matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") # Before: error # After: [, , ] ``` This pull request will fix it. 
--- lib/rexml/xpath_parser.rb | 2 +- test/xpath/test_base.rb | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 5eb1e5a9..f86a87e6 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -671,7 +671,7 @@ def sort(array_of_nodes, order) if order == :forward index else - -index + index.map(&:-@) end end ordered.collect do |_index, node| diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 1dacd69d..53264a9e 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -416,6 +416,12 @@ def test_preceding assert_equal( 4, cs.length ) end + def test_preceding_sibling + d = REXML::Document.new("") + matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") + assert_equal(["e", "d", "c"], matches.map(&:name)) + end + def test_following d = Document.new "" start = XPath.first( d, "/a/b[@id='0']" ) From 249d770b4ead129abf475708e84e3f1f7908962a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 6 May 2025 21:33:00 +0900 Subject: [PATCH 17/34] Fix duplicate responses in XPath following, following-sibling, preceding, preceding-sibling (#255) ## Why? 
See: https://github.com/ruby/rexml/pull/251#issuecomment-2845103143 ## Expected values - XPath : a/d/preceding::* => ["d", "c", "b"] ```xml ``` - XPath : a/d/following::* => ["d", "e", "f"] ```xml ``` - XPath : a/b/x/following-sibling:* => ["c", "d", "e"] ```xml ``` - XPath : a/b/x/following-sibling:* => ["c", "d", "x", "e"] ```xml ``` - XPath : a/b/x/preceding-sibling::* => ["e", "d", "c"] ```xml ``` - XPath : a/b/x/preceding-sibling::* => ["e", "x", "d", "c"] ```xml ``` - XPath : //a/following-sibling:*[1] => ["w", "x", "y", "z"] ```xml ``` --- lib/rexml/xpath_parser.rb | 2 +- test/xpath/test_base.rb | 97 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 95 insertions(+), 4 deletions(-) diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index f86a87e6..cde2e5d5 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -144,7 +144,7 @@ def match(path_stack, nodeset) result = expr(path_stack, nodeset) case result when Array # nodeset - unnode(result) + unnode(result).uniq else [result] end diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 53264a9e..b923eed2 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -416,12 +416,103 @@ def test_preceding assert_equal( 4, cs.length ) end - def test_preceding_sibling - d = REXML::Document.new("") - matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") + def test_preceding_multiple + source = <<-XML + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/d/preceding::*") + assert_equal(["d", "c", "b"], matches.map(&:name)) + end + + def test_following_multiple + source = <<-XML + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/d/following::*") + assert_equal(["d", "e", "f"], matches.map(&:name)) + end + + def test_following_sibling_across_multiple_nodes + source = <<-XML + + + + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, 
"a/b/x/following-sibling::*") + assert_equal(["c", "d", "e"], matches.map(&:name)) + end + + def test_following_sibling_within_single_node + source = <<-XML + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/following-sibling::*") + assert_equal(["c", "d", "x", "e"], matches.map(&:name)) + end + + def test_following_sibling_predicates + source = <<-XML + + XML + doc = REXML::Document.new(source) + # Finds a node flowing + matches = REXML::XPath.match(doc, "//a/following-sibling::*[1]") + assert_equal(["w", "x", "y", "z"], matches.map(&:name)) + end + + def test_preceding_sibling_across_multiple_nodes + source = <<-XML + + + + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/preceding-sibling::*") assert_equal(["e", "d", "c"], matches.map(&:name)) end + def test_preceding_sibling_within_single_node + source = <<-XML + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/preceding-sibling::*") + assert_equal(["e", "x", "d", "c"], matches.map(&:name)) + end + def test_following d = Document.new "" start = XPath.first( d, "/a/b[@id='0']" ) From cd575a10cac58eb47f235ed186060ac65ffb5284 Mon Sep 17 00:00:00 2001 From: tomoya ishida Date: Wed, 7 May 2025 21:02:31 +0900 Subject: [PATCH 18/34] Deprecate accepting array as an element in XPath.match, first and each (#252) `XPath.match`, `XPath.first`, `XPath.each`, `XPathParser#parse` and `XPathParser#match` accepted nodeset as element. This pull request changes the first parameter of these method to be an element instead of nodeset. Passing nodeset will be deprecated. ```ruby # Documented usage. OK REXML::XPath.match(element, xpath) # Undocumented usage. Deprecate in this pull request nodeset = [element] REXML::XPath.match(nodeset, xpath) ``` ### Background #249 will introduce a temporary cache. 
```ruby def parse path, nodeset path_stack = @parser.parse( path ) nodeset.first.document.send(:enable_cache) do match( path_stack, nodeset ) end end ``` But the signature `XPathParser#match(path, nodeset)` does not guarantee that all nodes in the nodeset have the same root document. So the cache does not work in the code below. It's still slow. ```ruby REXML::XPath.match(2.times.map { REXML::Document.new(''*400+''*400) }, 'a//a') ``` The interface is holding us back, so I propose that we stop accepting an array as the element. This change is a backward incompatibility, but it only drops an undocumented feature. I think only the test code was unintentionally using this feature. ### XPath.match with array XPath.match only traverses the first element of the array for some selectors. ```ruby nodeset = [REXML::Document.new(""), REXML::Document.new("")] REXML::XPath.match(nodeset, "a/*") #=> [, ] REXML::XPath.match(nodeset, "//a/*") #=> [] # I expect [, ] but the second document is ignored ``` It indicates that `XPath.match` is not designed to search inside multiple nodes/documents. --------- Co-authored-by: Sutou Kouhei --- lib/rexml/xpath.rb | 3 --- lib/rexml/xpath_parser.rb | 22 ++++++++++++---------- test/test_jaxen.rb | 16 ++++++++++------ test/xpath/test_base.rb | 17 ++++++++++++++--- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/lib/rexml/xpath.rb b/lib/rexml/xpath.rb index a0921bd8..666d764f 100644 --- a/lib/rexml/xpath.rb +++ b/lib/rexml/xpath.rb @@ -35,7 +35,6 @@ def XPath::first(element, path=nil, namespaces=nil, variables={}, options={}) parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of? Array parser.parse(path, element).flatten[0] end @@ -64,7 +63,6 @@ def XPath::each(element, path=nil, namespaces=nil, variables={}, options={}, &bl parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of? 
Array parser.parse(path, element).each( &block ) end @@ -74,7 +72,6 @@ def XPath::match(element, path=nil, namespaces=nil, variables={}, options={}) parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of? Array parser.parse(path,element) end end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index cde2e5d5..8440015b 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -76,19 +76,19 @@ def variables=( vars={} ) @variables = vars end - def parse path, nodeset + def parse path, node path_stack = @parser.parse( path ) - match( path_stack, nodeset ) + match( path_stack, node ) end - def get_first path, nodeset + def get_first path, node path_stack = @parser.parse( path ) - first( path_stack, nodeset ) + first( path_stack, node ) end - def predicate path, nodeset + def predicate path, node path_stack = @parser.parse( path ) - match( path_stack, nodeset ) + match( path_stack, node ) end def []=( variable_name, value ) @@ -136,11 +136,13 @@ def first( path_stack, node ) end - def match(path_stack, nodeset) - nodeset = nodeset.collect.with_index do |node, i| - position = i + 1 - XPathNode.new(node, position: position) + def match(path_stack, node) + if node.is_a?(Array) + Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) + return [] if node.empty? 
+ node = node.first end + nodeset = [XPathNode.new(node, position: 1)] result = expr(path_stack, nodeset) case result when Array # nodeset diff --git a/test/test_jaxen.rb b/test/test_jaxen.rb index 6038e88e..548120d6 100644 --- a/test/test_jaxen.rb +++ b/test/test_jaxen.rb @@ -56,7 +56,9 @@ def process_test_case(name) # processes a tests/document/context node def process_context(doc, context) - test_context = XPath.match(doc, context.attributes["select"]) + matched = XPath.match(doc, context.attributes["select"]) + assert_equal(1, matched.size) + test_context = matched.first namespaces = context.namespaces namespaces.delete("var") namespaces = nil if namespaces.empty? @@ -101,10 +103,14 @@ def process_nominal_test(context, variables, namespaces, test) assert_equal(Integer(expected, 10), matched.size, user_message(context, xpath, matched)) + else + assert_operator(matched.size, :>, 0, user_message(context, xpath, matched)) end XPath.each(test, "valueOf") do |value_of| - process_value_of(matched, variables, namespaces, value_of) + matched.each do |subcontext| + process_value_of(subcontext, variables, namespaces, value_of) + end end end @@ -118,10 +124,8 @@ def process_exceptional_test(context, variables, namespaces, test) def user_message(context, xpath, matched) message = "" - context.each_with_index do |node, i| - message << "Node#{i}:\n" - message << "#{node}\n" - end + message << "Node:\n" + message << "#{context}\n" message << "XPath: <#{xpath}>\n" message << "Matched <#{matched}>" message diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index b923eed2..ab22f6f9 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -411,9 +411,10 @@ def test_preceding s = "" d = REXML::Document.new(s) - c = REXML::XPath.match( d, "//c[@id = '5']") - cs = REXML::XPath.match( c, "preceding::c" ) - assert_equal( 4, cs.length ) + c = REXML::XPath.match(d, "//c[@id = '5']") + assert_equal(1, c.length) + cs = REXML::XPath.match(c.first, "preceding::c") + 
assert_equal(4, cs.length) end def test_preceding_multiple @@ -1255,5 +1256,15 @@ def test_or_and end assert_equal(["/"], hrefs, "Bug #3842 [ruby-core:32447]") end + + def test_match_with_deprecated_usage + verbose, $VERBOSE = $VERBOSE, nil + doc = Document.new("") + assert_equal(['b'], XPath.match([doc, doc], '//b').map(&:name)) + assert_equal(['b'], XPath.match([doc], '//b').map(&:name)) + assert_equal([], XPath.match([], '//b').map(&:name)) + ensure + $VERBOSE = verbose + end end end From e80ffdd12713cd138dbe33f26968452dc33d20df Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 12 May 2025 10:22:11 +0900 Subject: [PATCH 19/34] Improve using `//` in XPath performance (#249) When using `//` in XPath, the deeper the tag hierarchy, the slower it becomes due to the namespace acquisition process. Caching namespace information improves performance when using `//` with XPath. ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/xpath.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) REXML::XPath.match(REXML::Document.new(xml), 'a//a') 29.215 234.909 108.945 492.410 i/s - 100.000 times in 3.422925s 0.425697s 0.917898s 0.203083s Comparison: REXML::XPath.match(REXML::Document.new(xml), 'a//a') master(YJIT): 492.4 i/s master: 234.9 i/s - 2.10x slower 3.4.1(YJIT): 108.9 i/s - 4.52x slower rexml 3.4.1: 29.2 i/s - 16.85x slower ``` - YJIT=ON : 4.52x faster - YJIT=OFF : 8.04x faster --------- Co-authored-by: tomoya ishida Co-authored-by: Sutou Kouhei --- benchmark/xpath.yaml | 32 ++++++++++++++++++++++++++++++++ lib/rexml/attribute.rb | 4 ++++ lib/rexml/document.rb | 14 ++++++++++++++ lib/rexml/element.rb | 33 +++++++++++++++++---------------- lib/rexml/xpath_parser.rb | 27 ++++++++++++--------------- test/test_core.rb | 23 +++++++++++++++++------ test/xpath/test_base.rb | 10 ++++++++++ 7 files changed, 106 insertions(+), 37 deletions(-) create mode 100644 benchmark/xpath.yaml diff --git 
a/benchmark/xpath.yaml b/benchmark/xpath.yaml new file mode 100644 index 00000000..d6e970eb --- /dev/null +++ b/benchmark/xpath.yaml @@ -0,0 +1,32 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + DEPTH = 100 + xml = '' * DEPTH + '' * DEPTH + doc = REXML::Document.new(xml) + +benchmark: + "REXML::XPath.match(REXML::Document.new(xml), 'a//a')" : REXML::XPath.match(doc, "a//a") diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index fe48745c..7a190225 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -206,6 +206,10 @@ def xpath path += "/@#{self.expanded_name}" return path end + + def document + @element&.document + end end end #vim:ts=2 sw=2 noexpandtab: diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 1960012c..1c678bef 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -448,6 +448,20 @@ def document end private + + attr_accessor :namespaces_cache + + # New document level cache is created and available in this block. + # This API is thread unsafe. Users can't change this document in this block. 
+ def enable_cache + @namespaces_cache = {} + begin + yield + ensure + @namespaces_cache = nil + end + end + def build( source ) Parsers::TreeParser.new( source, self ).parse end diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 4e3a60b9..b62b6cc2 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -589,10 +589,12 @@ def prefixes # d.elements['//c'].namespaces # => {"x"=>"1", "y"=>"2", "z"=>"3"} # def namespaces - namespaces = {} - namespaces = parent.namespaces if parent - namespaces = namespaces.merge( attributes.namespaces ) - return namespaces + namespaces_cache = document&.__send__(:namespaces_cache) + if namespaces_cache + namespaces_cache[self] ||= calculate_namespaces + else + calculate_namespaces + end end # :call-seq: @@ -619,17 +621,9 @@ def namespace(prefix=nil) if prefix.nil? prefix = prefix() end - if prefix == '' - prefix = "xmlns" - else - prefix = "xmlns:#{prefix}" unless prefix[0,5] == 'xmlns' - end - ns = nil - target = self - while ns.nil? and target - ns = target.attributes[prefix] - target = target.parent - end + prefix = (prefix == '') ? 'xmlns' : prefix.delete_prefix("xmlns:") + ns = namespaces[prefix] + ns = '' if ns.nil? 
and prefix == 'xmlns' return ns end @@ -1516,8 +1510,15 @@ def write(output=$stdout, indent=-1, transitive=false, ie_hack=false) formatter.write( self, output ) end - private + def calculate_namespaces + if parent + parent.namespaces.merge(attributes.namespaces) + else + attributes.namespaces + end + end + def __to_xpath_helper node rv = node.expanded_name.clone if node.parent diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 8440015b..70ae8919 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -78,7 +78,15 @@ def variables=( vars={} ) def parse path, node path_stack = @parser.parse( path ) - match( path_stack, node ) + if node.is_a?(Array) + Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) + return [] if node.empty? + node = node.first + end + + node.document.__send__(:enable_cache) do + match( path_stack, node ) + end end def get_first path, node @@ -137,11 +145,6 @@ def first( path_stack, node ) def match(path_stack, node) - if node.is_a?(Array) - Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) - return [] if node.empty? - node = node.first - end nodeset = [XPathNode.new(node, position: 1)] result = expr(path_stack, nodeset) case result @@ -494,14 +497,10 @@ def node_test(path_stack, nodesets, any_type: :element) if strict? raw_node.name == name and raw_node.namespace == "" else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node, prefix) end else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node, prefix) end when :attribute if prefix.nil? 
@@ -509,9 +508,7 @@ def node_test(path_stack, nodesets, any_type: :element) elsif prefix.empty? raw_node.name == name and raw_node.namespace == "" else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node.element, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node.element, prefix) end else false diff --git a/test/test_core.rb b/test/test_core.rb index 34fe9e07..651056f2 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -653,18 +653,23 @@ def test_namespace assert_equal "Some text", out end - def test_add_namespace e = Element.new 'a' + assert_equal("", e.namespace) + assert_nil(e.namespace('foo')) e.add_namespace 'someuri' e.add_namespace 'foo', 'otheruri' e.add_namespace 'xmlns:bar', 'thirduri' - assert_equal 'someuri', e.attributes['xmlns'] - assert_equal 'otheruri', e.attributes['xmlns:foo'] - assert_equal 'thirduri', e.attributes['xmlns:bar'] + assert_equal("someuri", e.namespace) + assert_equal("otheruri", e.namespace('foo')) + assert_equal("otheruri", e.namespace('xmlns:foo')) + assert_equal("thirduri", e.namespace('bar')) + assert_equal("thirduri", e.namespace('xmlns:bar')) + assert_equal('someuri', e.attributes['xmlns']) + assert_equal('otheruri', e.attributes['xmlns:foo']) + assert_equal('thirduri', e.attributes['xmlns:bar']) end - def test_big_documentation d = File.open(fixture_path("documentation.xml")) {|f| Document.new f } assert_equal "Sean Russell", d.elements["documentation/head/author"].text.tr("\n\t", " ").squeeze(" ") @@ -764,9 +769,15 @@ def test_attributes_each def test_delete_namespace doc = Document.new "" + assert_equal("1", doc.root.namespace) + assert_equal("2", doc.root.namespace('x')) + assert_equal("2", doc.root.namespace('xmlns:x')) doc.root.delete_namespace doc.root.delete_namespace 'x' - assert_equal "", doc.to_s + assert_equal("", doc.to_s) + assert_equal("", doc.root.namespace) + 
assert_nil(doc.root.namespace('x')) + assert_nil(doc.root.namespace('xmlns:x')) end def test_each_element_with_attribute diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index ab22f6f9..764171ab 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -1193,6 +1193,16 @@ def test_namespaces_0 assert_equal( 1, XPath.match( d, "//x:*" ).size ) end + def test_namespaces_cache + doc = Document.new("") + assert_equal("", XPath.first(doc, "//b[namespace-uri()='1']").to_s) + assert_nil(XPath.first(doc, "//b[namespace-uri()='']")) + + doc.root.delete_namespace + assert_nil(XPath.first(doc, "//b[namespace-uri()='1']")) + assert_equal("", XPath.first(doc, "//b[namespace-uri()='']").to_s) + end + def test_ticket_71 doc = Document.new(%Q{}) el = doc.root.elements[1] From 3dc9eca877f8444b7ac1d6008feb724cbfdc239a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 29 May 2025 10:14:32 +0900 Subject: [PATCH 20/34] Improve `Text.check` performance (#256) The doctype parameter of Text.check is not being used. Changing the doctype parameter to an optional parameter improves the parsing speed of the DOM. 
## Benchmark ``` before after before(YJIT) after(YJIT) dom 19.854 23.805 33.969 37.712 i/s - 100.000 times in 5.036779s 4.200839s 2.943877s 2.651709s sax 29.436 30.494 54.070 55.089 i/s - 100.000 times in 3.397155s 3.279348s 1.849463s 1.815255s pull 34.908 34.857 62.969 64.895 i/s - 100.000 times in 2.864651s 2.868842s 1.588082s 1.540939s stream 34.570 34.281 60.616 60.355 i/s - 100.000 times in 2.892656s 2.917080s 1.649737s 1.656866s Comparison: dom after(YJIT): 37.7 i/s before(YJIT): 34.0 i/s - 1.11x slower after: 23.8 i/s - 1.58x slower before: 19.9 i/s - 1.90x slower sax after(YJIT): 55.1 i/s before(YJIT): 54.1 i/s - 1.02x slower after: 30.5 i/s - 1.81x slower before: 29.4 i/s - 1.87x slower pull after(YJIT): 64.9 i/s before(YJIT): 63.0 i/s - 1.03x slower before: 34.9 i/s - 1.86x slower after: 34.9 i/s - 1.86x slower stream before(YJIT): 60.6 i/s after(YJIT): 60.4 i/s - 1.00x slower before: 34.6 i/s - 1.75x slower after: 34.3 i/s - 1.77x slower ``` - YJIT=ON : 1.00x - 1.11x faster (dom: 1.11x faster) - YJIT=OFF : 1.00x - 1.20x faster (dom: 1.20x faster) --- lib/rexml/attribute.rb | 2 +- lib/rexml/text.rb | 6 +++--- test/test_text_check.rb | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 7a190225..ba49207c 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -173,7 +173,7 @@ def element=( element ) @element = element if @normalized - Text.check( @normalized, NEEDS_A_SECOND_CHECK, doctype ) + Text.check( @normalized, NEEDS_A_SECOND_CHECK ) end self diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 2bf480fb..6f821472 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -104,16 +104,16 @@ def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, @entity_filter = entity_filter if entity_filter clear_cache - Text.check(@string, illegal, doctype) if @raw + Text.check(@string, illegal) if @raw end def parent= parent super(parent) - Text.check(@string, 
NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent + Text.check(@string, NEEDS_A_SECOND_CHECK) if @raw and @parent end # check for illegal characters - def Text.check string, pattern, doctype + def Text.check string, pattern, doctype = nil # illegal anywhere if !string.match?(VALID_XML_CHARS) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index 11cf65a3..3f2f7864 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -4,7 +4,7 @@ module REXMLTests class TextCheckTester < Test::Unit::TestCase def check(string) - REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil) + REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK) end def assert_check(string) From 95b8ef8d8549eb98763477e6e5307bf97c1dc4c5 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 7 Jul 2025 14:16:56 +0900 Subject: [PATCH 21/34] Fix wrong Encoding resolution (#258) In this context, `Encoding` means `REXML::Encoding` not `Encoding`. --- lib/rexml/encoding.rb | 2 +- test/test_source.rb | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 test/test_source.rb diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index da2d70d6..f8459316 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -5,7 +5,7 @@ module Encoding # ID ---> Encoding name attr_reader :encoding def encoding=(encoding) - encoding = encoding.name if encoding.is_a?(Encoding) + encoding = encoding.name if encoding.is_a?(::Encoding) if encoding.is_a?(String) original_encoding = encoding encoding = find_encoding(encoding) diff --git a/test/test_source.rb b/test/test_source.rb new file mode 100644 index 00000000..b309105a --- /dev/null +++ b/test/test_source.rb @@ -0,0 +1,21 @@ +require "rexml/source" + +module REXMLTests + class TestSource < Test::Unit::TestCase + def setup + @source = REXML::Source.new(+"") + end + + sub_test_case("#encoding=") do + test("String") do + @source.encoding = "UTF-8" + assert_equal("UTF-8", 
@source.encoding) + end + + test("Encoding") do + @source.encoding = Encoding::UTF_8 + assert_equal("UTF-8", @source.encoding) + end + end + end +end From 548172637b8eb106ea38f3b91f54d0fc2e6e8e08 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 9 Jul 2025 06:14:08 +0900 Subject: [PATCH 22/34] Don't call needless encoding_updated (#259) A needless `encoding_updated` call may incur a small performance penalty. --- lib/rexml/encoding.rb | 7 ++----- test/test_source.rb | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index f8459316..7eb05f4d 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -13,12 +13,9 @@ def encoding=(encoding) raise ArgumentError, "Bad encoding name #{original_encoding}" end end + encoding = encoding.upcase if encoding return false if defined?(@encoding) and encoding == @encoding - if encoding - @encoding = encoding.upcase - else - @encoding = 'UTF-8' - end + @encoding = encoding || "UTF-8" true end diff --git a/test/test_source.rb b/test/test_source.rb index b309105a..86755f37 100644 --- a/test/test_source.rb +++ b/test/test_source.rb @@ -12,6 +12,21 @@ def setup assert_equal("UTF-8", @source.encoding) end + test("encoding_updated") do + def @source.n_encoding_updated_called + @n_encoding_updated_called + end + def @source.encoding_updated + super + @n_encoding_updated_called ||= 0 + @n_encoding_updated_called += 1 + end + @source.encoding = "shift-jis" + assert_equal(1, @source.n_encoding_updated_called) + @source.encoding = "Shift-JIS" + assert_equal(1, @source.n_encoding_updated_called) + end + test("Encoding") do @source.encoding = Encoding::UTF_8 assert_equal("UTF-8", @source.encoding) From ec410a0d5e5e5daddca82fd1455824219403f676 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:06:45 +0700 Subject: [PATCH 23/34] Reuse XPath.match (#263) `XPath.each` and `XPath.first` can reuse `XPath.match`. 
--- lib/rexml/xpath.rb | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/lib/rexml/xpath.rb b/lib/rexml/xpath.rb index 666d764f..eed0300c 100644 --- a/lib/rexml/xpath.rb +++ b/lib/rexml/xpath.rb @@ -31,11 +31,7 @@ class XPath def XPath::first(element, path=nil, namespaces=nil, variables={}, options={}) raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.nil? or namespaces.kind_of?(Hash) raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of?(Hash) - parser = XPathParser.new(**options) - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - parser.parse(path, element).flatten[0] + match(element, path, namespaces, variables, options).flatten[0] end # Iterates over nodes that match the given path, calling the supplied @@ -59,11 +55,7 @@ def XPath::first(element, path=nil, namespaces=nil, variables={}, options={}) def XPath::each(element, path=nil, namespaces=nil, variables={}, options={}, &block) raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.nil? or namespaces.kind_of?(Hash) raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of?(Hash) - parser = XPathParser.new(**options) - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - parser.parse(path, element).each( &block ) + match(element, path, namespaces, variables, options).each( &block ) end # Returns an array of nodes matching a given XPath. From 2271fd374403bcdfb0b9f288cc0d97c92af9d886 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:18:20 +0700 Subject: [PATCH 24/34] docs: Use # to reference instance methods (#270) Fixes #269 We should use `XXX#method` not `XXX.method` to reference instance methods. 
--- lib/rexml/cdata.rb | 2 +- lib/rexml/comment.rb | 2 +- lib/rexml/element.rb | 2 +- lib/rexml/instruction.rb | 2 +- lib/rexml/node.rb | 2 +- lib/rexml/text.rb | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/rexml/cdata.rb b/lib/rexml/cdata.rb index 997f5a08..264ad642 100644 --- a/lib/rexml/cdata.rb +++ b/lib/rexml/cdata.rb @@ -58,7 +58,7 @@ def value # c = CData.new( " Some text " ) # c.write( $stdout ) #-> def write( output=$stdout, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn( "#{self.class.name}.write is deprecated", uplevel: 1) + Kernel.warn( "#{self.class.name}#write is deprecated", uplevel: 1) indent( output, indent ) output << START output << @string diff --git a/lib/rexml/comment.rb b/lib/rexml/comment.rb index 52c58b46..e7e104d4 100644 --- a/lib/rexml/comment.rb +++ b/lib/rexml/comment.rb @@ -48,7 +48,7 @@ def clone # ie_hack:: # Needed for conformity to the child API, but not used by this class. def write( output, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn("Comment.write is deprecated. See REXML::Formatters", uplevel: 1) + Kernel.warn("#{self.class.name}#write is deprecated. See REXML::Formatters", uplevel: 1) indent( output, indent ) output << START output << @string diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index b62b6cc2..4311d58f 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -1496,7 +1496,7 @@ def texts # doc.write( out ) #-> doc is written to the string 'out' # doc.write( $stdout ) #-> doc written to the console def write(output=$stdout, indent=-1, transitive=false, ie_hack=false) - Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters", uplevel: 1) + Kernel.warn("#{self.class.name}#write is deprecated. 
See REXML::Formatters", uplevel: 1) formatter = if indent > -1 if transitive require_relative "formatters/transitive" diff --git a/lib/rexml/instruction.rb b/lib/rexml/instruction.rb index 318741f0..a3dfbbec 100644 --- a/lib/rexml/instruction.rb +++ b/lib/rexml/instruction.rb @@ -49,7 +49,7 @@ def clone # See the rexml/formatters package # def write writer, indent=-1, transitive=false, ie_hack=false - Kernel.warn( "#{self.class.name}.write is deprecated", uplevel: 1) + Kernel.warn( "#{self.class.name}#write is deprecated", uplevel: 1) indent(writer, indent) writer << START writer << @target diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index c771db70..033b740d 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -26,7 +26,7 @@ def previous_sibling_node # REXML::Formatters package for changing the output style. def to_s indent=nil unless indent.nil? - Kernel.warn( "#{self.class.name}.to_s(indent) parameter is deprecated", uplevel: 1) + Kernel.warn( "#{self.class.name}#to_s(indent) parameter is deprecated", uplevel: 1) f = REXML::Formatters::Pretty.new( indent ) f.write( self, rv = "" ) else diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 6f821472..e03ce9d1 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -287,7 +287,7 @@ def indent_text(string, level=1, style="\t", indentfirstline=true) # See REXML::Formatters # def write( writer, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters", uplevel: 1) + Kernel.warn("#{self.class.name}#write is deprecated. See REXML::Formatters", uplevel: 1) formatter = if indent > -1 REXML::Formatters::Pretty.new( indent ) else From d427fc5914fcc17d7247c5ff9099ee38639d6702 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:20:21 +0700 Subject: [PATCH 25/34] Avoid redundant calls for doctype (#264) We can avoid calling `Document#doctype` by keeping `Document#doctype` result in a local variable. 
--- lib/rexml/element.rb | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 4311d58f..1c580577 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -2325,11 +2325,11 @@ def get_attribute( name ) return attr end end - element_document = @element.document - if element_document and element_document.doctype + doctype = @element.document&.doctype + if doctype expn = @element.expanded_name - expn = element_document.doctype.name if expn.size == 0 - attr_val = element_document.doctype.attribute_of(expn, name) + expn = doctype.name if expn.size == 0 + attr_val = doctype.attribute_of(expn, name) return Attribute.new( name, attr_val ) if attr_val end return nil @@ -2371,8 +2371,9 @@ def []=( name, value ) end unless value.kind_of? Attribute - if @element.document and @element.document.doctype - value = Text::normalize( value, @element.document.doctype ) + doctype = @element.document&.doctype + if doctype + value = Text::normalize( value, doctype ) else value = Text::normalize( value, nil ) end @@ -2409,10 +2410,11 @@ def prefixes each_attribute do |attribute| ns << attribute.name if attribute.prefix == 'xmlns' end - if @element.document and @element.document.doctype + doctype = @element.document&.doctype + if doctype expn = @element.expanded_name - expn = @element.document.doctype.name if expn.size == 0 - @element.document.doctype.attributes_of(expn).each { + expn = doctype.name if expn.size == 0 + doctype.attributes_of(expn).each { |attribute| ns << attribute.name if attribute.prefix == 'xmlns' } @@ -2434,10 +2436,11 @@ def namespaces each_attribute do |attribute| namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' end - if @element.document and @element.document.doctype + doctype = @element.document&.doctype + if doctype expn = @element.expanded_name - expn = @element.document.doctype.name if expn.size == 0 - 
@element.document.doctype.attributes_of(expn).each { + expn = doctype.name if expn.size == 0 + doctype.attributes_of(expn).each { |attribute| namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' } From 63f3e9772595a64b036953f0ab026d2ea5560a3b Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:21:10 +0700 Subject: [PATCH 26/34] Use Safe Navigation (&.) from Ruby 2.3 (#265) We can simplify our code by using `&.`. --- lib/rexml/attribute.rb | 5 +---- lib/rexml/child.rb | 3 +-- lib/rexml/doctype.rb | 11 +++-------- lib/rexml/element.rb | 3 +-- lib/rexml/text.rb | 5 +---- 5 files changed, 7 insertions(+), 20 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index ba49207c..1326563a 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -130,10 +130,7 @@ def to_string end def doctype - if @element - doc = @element.document - doc.doctype if doc - end + @element&.document&.doctype end # Returns the attribute value, with entities replaced diff --git a/lib/rexml/child.rb b/lib/rexml/child.rb index cc6e9a47..40abde87 100644 --- a/lib/rexml/child.rb +++ b/lib/rexml/child.rb @@ -83,8 +83,7 @@ def previous_sibling=(other) # Returns:: the document this child belongs to, or nil if this child # belongs to no document def document - return parent.document unless parent.nil? 
- nil + parent&.document end # This doesn't yet handle encodings diff --git a/lib/rexml/doctype.rb b/lib/rexml/doctype.rb index f3590484..a9cf9f7e 100644 --- a/lib/rexml/doctype.rb +++ b/lib/rexml/doctype.rb @@ -171,15 +171,11 @@ def write( output, indent=0, transitive=false, ie_hack=false ) end def context - if @parent - @parent.context - else - nil - end + @parent&.context end def entity( name ) - @entities[name].unnormalized if @entities[name] + @entities[name]&.unnormalized end def add child @@ -288,8 +284,7 @@ def initialize name, middle, pub, sys end def to_s - context = nil - context = parent.context if parent + context = parent&.context notation = "( other ) end def doctype - if @parent - doc = @parent.document - doc.doctype if doc - end + @parent&.document&.doctype end REFERENCE = /#{Entity::REFERENCE}/ From 66232eaf680d0937ae59bea285cdb8e4d3d88a93 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 08:12:02 +0700 Subject: [PATCH 27/34] Remove redundant return statements (#266) Very slight behavior change here in `REXML::Validation::Event#matches?`, which is to align the predicate method's return value with the expected behavior of a predicate method (which is to return one of true or false). 
--- lib/rexml/attribute.rb | 4 +-- lib/rexml/document.rb | 4 +-- lib/rexml/element.rb | 47 +++++++++++--------------- lib/rexml/functions.rb | 6 ++-- lib/rexml/namespace.rb | 8 ++--- lib/rexml/node.rb | 2 +- lib/rexml/parsers/baseparser.rb | 7 ++-- lib/rexml/parsers/xpathparser.rb | 8 ++--- lib/rexml/quickpath.rb | 37 +++++++++++---------- lib/rexml/security.rb | 4 +-- lib/rexml/text.rb | 14 ++++---- lib/rexml/validation/relaxng.rb | 53 +++++++++++++++--------------- lib/rexml/validation/validation.rb | 16 ++++----- lib/rexml/xpath_parser.rb | 38 ++++++++++----------- 14 files changed, 118 insertions(+), 130 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 1326563a..c5673249 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -199,9 +199,7 @@ def inspect end def xpath - path = @element.xpath - path += "/@#{self.expanded_name}" - return path + @element.xpath + "/@#{self.expanded_name}" end def document diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 1c678bef..96ae5b75 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -415,7 +415,7 @@ def Document::entity_expansion_limit=( val ) # # Deprecated. Use REXML::Security.entity_expansion_limit= instead. def Document::entity_expansion_limit - return Security.entity_expansion_limit + Security.entity_expansion_limit end # Set the entity expansion limit. By default the limit is set to 10240. @@ -429,7 +429,7 @@ def Document::entity_expansion_text_limit=( val ) # # Deprecated. Use REXML::Security.entity_expansion_text_limit instead. 
def Document::entity_expansion_text_limit - return Security.entity_expansion_text_limit + Security.entity_expansion_text_limit end attr_reader :entity_expansion_count diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index e9ca684e..0d74811e 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -565,7 +565,7 @@ def prefixes prefixes = [] prefixes = parent.prefixes if parent prefixes |= attributes.prefixes - return prefixes + prefixes end # :call-seq: @@ -624,7 +624,7 @@ def namespace(prefix=nil) ns = namespaces[prefix] ns = '' if ns.nil? and prefix == 'xmlns' - return ns + ns end # :call-seq: @@ -956,7 +956,7 @@ def get_elements( xpath ) def next_element element = next_sibling element = element.next_sibling until element.nil? or element.kind_of? Element - return element + element end # :call-seq: @@ -972,7 +972,7 @@ def next_element def previous_element element = previous_sibling element = element.previous_sibling until element.nil? or element.kind_of? Element - return element + element end @@ -1022,8 +1022,7 @@ def has_text? # def text( path = nil ) rv = get_text(path) - return rv.value unless rv.nil? - nil + rv&.value end # :call-seq: @@ -1051,7 +1050,7 @@ def get_text path = nil else rv = @children.find { |node| node.kind_of? Text } end - return rv + rv end # :call-seq: @@ -1095,7 +1094,7 @@ def text=( text ) old_text.replace_with( text ) end end - return self + self end # :call-seq: @@ -1146,7 +1145,7 @@ def add_text( text ) text = Text.new( text, whitespace(), nil, raw() ) end self << text unless text.nil? 
- return self + self end # :call-seq: @@ -1190,7 +1189,7 @@ def xpath cur = cur.parent path_elements << __to_xpath_helper( cur ) end - return path_elements.reverse.join( "/" ) + path_elements.reverse.join( "/" ) end ################################################# @@ -1292,7 +1291,6 @@ def attribute( name, namespace=nil ) return nil unless ( namespaces[ prefix ] == namespaces[ 'xmlns' ] ) attributes.get_attribute( name ) - end # :call-seq: @@ -1306,7 +1304,7 @@ def attribute( name, namespace=nil ) # b.has_attributes? # => false # def has_attributes? - return !@attributes.empty? + !@attributes.empty? end # :call-seq: @@ -1684,11 +1682,7 @@ def []( index, name=nil) (num += 1) == index } else - return XPath::first( @element, index ) - #{ |element| - # return element if element.kind_of? Element - #} - #return nil + XPath::first( @element, index ) end end @@ -1735,7 +1729,7 @@ def []=( index, element ) else previous.replace_with element end - return previous + previous end # :call-seq: @@ -1774,7 +1768,7 @@ def index element child == element end return rv if found == element - return -1 + -1 end # :call-seq: @@ -1853,7 +1847,7 @@ def delete_all( xpath ) @element.delete element element.remove end - return rv + rv end # :call-seq: @@ -2180,8 +2174,7 @@ def initialize element # def [](name) attr = get_attribute(name) - return attr.value unless attr.nil? - return nil + attr&.value end # :call-seq: @@ -2336,7 +2329,7 @@ def get_attribute( name ) if attr.kind_of? Hash attr = attr[ @element.prefix ] end - return attr + attr end # :call-seq: @@ -2390,7 +2383,7 @@ def []=( name, value ) else store value.name, value end - return @element + @element end # :call-seq: @@ -2494,9 +2487,7 @@ def delete( attribute ) old.each_value{|v| repl = v} store name, repl end - elsif old.nil? 
- return @element - else # the supplied attribute is a top-level one + elsif old # the supplied attribute is a top-level one super(name) end @element @@ -2550,7 +2541,7 @@ def delete_all( name ) rv << attribute if attribute.expanded_name == name } rv.each{ |attr| attr.remove } - return rv + rv end # :call-seq: diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb index 4c114616..60ae34e7 100644 --- a/lib/rexml/functions.rb +++ b/lib/rexml/functions.rb @@ -39,11 +39,11 @@ def Functions::context=(value); @@context = value; end def Functions::text( ) if @@context[:node].node_type == :element - return @@context[:node].find_all{|n| n.node_type == :text}.collect{|n| n.value} + @@context[:node].find_all{|n| n.node_type == :text}.collect{|n| n.value} elsif @@context[:node].node_type == :text - return @@context[:node].value + @@context[:node].value else - return false + false end end diff --git a/lib/rexml/namespace.rb b/lib/rexml/namespace.rb index 2e67252a..232b7ca4 100644 --- a/lib/rexml/namespace.rb +++ b/lib/rexml/namespace.rb @@ -42,11 +42,11 @@ def name=( name ) # Compares names optionally WITH namespaces def has_name?( other, ns=nil ) if ns - return (namespace() == ns and name() == other) + namespace() == ns and name() == other elsif other.include? 
":" - return fully_expanded_name == other + fully_expanded_name == other else - return name == other + name == other end end @@ -57,7 +57,7 @@ def has_name?( other, ns=nil ) def fully_expanded_name ns = prefix return "#{ns}:#@name" if ns.size > 0 - return @name + @name end end end diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index 033b740d..bccacc51 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -68,7 +68,7 @@ def find_first_recursive(&block) # :yields: node each_recursive {|node| return node if block.call(node) } - return nil + nil end # Returns the position that +self+ holds in its parent's array, indexed diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 750b1697..a87657b5 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -206,12 +206,12 @@ def position # Returns true if there are no more events def empty? - return (@source.empty? and @stack.empty?) + (@source.empty? and @stack.empty?) end # Returns true if there are more events. Synonymous with !empty? def has_next? - return !(@source.empty? and @stack.empty?) + !(@source.empty? and @stack.empty?) end # Push an event back on the head of the stream. This method @@ -522,7 +522,8 @@ def pull_event raise REXML::ParseException.new( "Exception parsing", @source, self, (error ? error : $!) ) end - return [ :dummy ] + # NOTE: The end of the method never runs, because it is unreachable. + # All branches of code above have explicit unconditional return or raise statements. 
end private :pull_event diff --git a/lib/rexml/parsers/xpathparser.rb b/lib/rexml/parsers/xpathparser.rb index bd3b6856..a6d76fdc 100644 --- a/lib/rexml/parsers/xpathparser.rb +++ b/lib/rexml/parsers/xpathparser.rb @@ -215,7 +215,7 @@ def predicate_to_path(parsed, &block) else path << yield( parsed ) end - return path.squeeze(" ") + path.squeeze(" ") end # For backward compatibility alias_method :preciate_to_string, :predicate_to_path @@ -252,7 +252,7 @@ def LocationPath path, parsed path = path[1..-1] end end - return RelativeLocationPath( path, parsed ) if path.size > 0 + RelativeLocationPath( path, parsed ) if path.size > 0 end #RelativeLocationPath @@ -388,7 +388,7 @@ def NodeTest path, parsed else path = original_path end - return path + path end # Filters the supplied nodeset on the predicate(s) @@ -600,7 +600,7 @@ def PathExpr path, parsed end rest = LocationPath(rest, n) if rest =~ /\A[\/\.\@\[\w*]/ parsed.concat(n) - return rest + rest end #| FilterExpr Predicate diff --git a/lib/rexml/quickpath.rb b/lib/rexml/quickpath.rb index a0466b25..cded06f5 100644 --- a/lib/rexml/quickpath.rb +++ b/lib/rexml/quickpath.rb @@ -41,7 +41,7 @@ def QuickPath::match element, path, namespaces=EMPTY_HASH else results = filter([element], path) end - return results + results end # Given an array of nodes it filters the array based on the path. The @@ -51,18 +51,18 @@ def QuickPath::filter elements, path return elements if path.nil? 
or path == '' or elements.size == 0 case path when /^\/\//u # Descendant - return axe( elements, "descendant-or-self", $' ) + axe( elements, "descendant-or-self", $' ) when /^\/?\b(\w[-\w]*)\b::/u # Axe - return axe( elements, $1, $' ) + axe( elements, $1, $' ) when /^\/(?=\b([:!\w][-\.\w]*:)?[-!\*\.\w]*\b([^:(]|$)|\*)/u # Child rest = $' results = [] elements.each do |element| results |= filter( element.to_a, rest ) end - return results + results when /^\/?(\w[-\w]*)\(/u # / Function - return function( elements, $1, $' ) + function( elements, $1, $' ) when Namespace::NAMESPLIT # Element name name = $2 ns = $1 @@ -73,21 +73,21 @@ def QuickPath::filter elements, path (element.name == name and element.namespace == Functions.namespace_context[ns]))) end - return filter( elements, rest ) + filter( elements, rest ) when /^\/\[/u matches = [] elements.each do |element| matches |= predicate( element.to_a, path[1..-1] ) if element.kind_of? Element end - return matches + matches when /^\[/u # Predicate - return predicate( elements, path ) + predicate( elements, path ) when /^\/?\.\.\./u # Ancestor - return axe( elements, "ancestor", $' ) + axe( elements, "ancestor", $' ) when /^\/?\.\./u # Parent - return filter( elements.collect{|e|e.parent}, $' ) + filter( elements.collect{|e|e.parent}, $' ) when /^\/?\./u # Self - return filter( elements, $' ) + filter( elements, $' ) when /^\*/u # Any results = [] elements.each do |element| @@ -98,9 +98,10 @@ def QuickPath::filter elements, path # results |= filter( children, $' ) #end end - return results + results + else + [] end - return [] end def QuickPath::axe( elements, axe_name, rest ) @@ -138,7 +139,7 @@ def QuickPath::axe( elements, axe_name, rest ) matches = filter(elements.collect{|element| element.previous_sibling}.uniq, rest ) end - return matches.uniq + matches.uniq end OPERAND_ = '((?=(?:(?!and|or).)*[^\s<>=])[^\s<>=]+)' @@ -200,15 +201,15 @@ def QuickPath::predicate( elements, path ) results << element end end - return 
filter( results, rest ) + filter( results, rest ) end def QuickPath::attribute( name ) - return Functions.node.attributes[name] if Functions.node.kind_of? Element + Functions.node.attributes[name] if Functions.node.kind_of? Element end def QuickPath::name() - return Functions.node.name if Functions.node.kind_of? Element + Functions.node.name if Functions.node.kind_of? Element end def QuickPath::method_missing( id, *args ) @@ -234,7 +235,7 @@ def QuickPath::function( elements, fname, rest ) results << element if Functions.pair[0] == res end end - return results + results end def QuickPath::parse_args( element, string ) diff --git a/lib/rexml/security.rb b/lib/rexml/security.rb index 99b74607..e8e8c6b4 100644 --- a/lib/rexml/security.rb +++ b/lib/rexml/security.rb @@ -10,7 +10,7 @@ def self.entity_expansion_limit=( val ) # Get the entity expansion limit. By default the limit is set to 10000. def self.entity_expansion_limit - return @@entity_expansion_limit + @@entity_expansion_limit end @@entity_expansion_text_limit = 10_240 @@ -22,7 +22,7 @@ def self.entity_expansion_text_limit=( val ) # Get the entity expansion limit. By default the limit is set to 10240. def self.entity_expansion_text_limit - return @@entity_expansion_text_limit + @@entity_expansion_text_limit end end end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index c70f73f2..8799d89d 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -177,7 +177,7 @@ def empty? def clone - return Text.new(self, true) + Text.new(self, true) end @@ -261,10 +261,10 @@ def wrap(string, width, addnewline=false) # Recursively wrap string at width. 
return string if string.length <= width place = string.rindex(' ', width) # Position in string with last ' ' before cutoff - if addnewline then - return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) + if addnewline + "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) else - return string[0,place] + "\n" + wrap(string[place+1..-1], width) + string[0,place] + "\n" + wrap(string[place+1..-1], width) end end @@ -277,7 +277,7 @@ def indent_text(string, level=1, style="\t", indentfirstline=true) new_string << new_line } new_string.strip! unless indentfirstline - return new_string + new_string end # == DEPRECATED @@ -296,9 +296,7 @@ def write( writer, indent=-1, transitive=false, ie_hack=false ) # FIXME # This probably won't work properly def xpath - path = @parent.xpath - path += "/text()" - return path + @parent.xpath + "/text()" end # Writes out text, substituting special characters beforehand. diff --git a/lib/rexml/validation/relaxng.rb b/lib/rexml/validation/relaxng.rb index f29a2c05..c6894dcb 100644 --- a/lib/rexml/validation/relaxng.rb +++ b/lib/rexml/validation/relaxng.rb @@ -157,16 +157,16 @@ def next( event ) if ( @events[@current].matches?(event) ) @current += 1 if @events[@current].nil? - return @previous.pop + @previous.pop elsif @events[@current].kind_of? 
State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else - return nil + nil end end @@ -186,7 +186,7 @@ def inspect end def expected - return [@events[@current]] + [@events[@current]] end def <<( event ) @@ -244,7 +244,7 @@ def generate_event( event ) evt = :end_attribute end end - return Event.new( evt, arg ) + Event.new( evt, arg ) end end @@ -262,9 +262,10 @@ def next( event ) rv = super return rv if rv @prior = @previous.pop - return @prior.next( event ) + @prior.next( event ) + else + super end - super end def matches?(event) @@ -274,7 +275,7 @@ def matches?(event) def expected return [ @prior.expected, @events[0] ].flatten if @current == 0 - return [@events[@current]] + [@events[@current]] end end @@ -286,24 +287,24 @@ def next( event ) @current += 1 if @events[@current].nil? @current = 0 - return self + self elsif @events[@current].kind_of? State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else @prior = @previous.pop return @prior.next( event ) if @current == 0 - return nil + nil end end def expected return [ @prior.expected, @events[0] ].flatten if @current == 0 - return [@events[@current]] + [@events[@current]] end end @@ -326,17 +327,17 @@ def next( event ) @ord += 1 if @events[@current].nil? @current = 0 - return self + self elsif @events[@current].kind_of? 
State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else return @previous.pop.next( event ) if @current == 0 and @ord > 0 - return nil + nil end end @@ -347,9 +348,9 @@ def matches?( event ) def expected if @current == 0 and @ord > 0 - return [@previous[-1].expected, @events[0]].flatten + [@previous[-1].expected, @events[0]].flatten else - return [@events[@current]] + [@events[@current]] end end end @@ -403,7 +404,7 @@ def matches?( event ) def expected return [@events[@current]] if @events.size > 0 - return @choices.collect do |x| + @choices.collect do |x| if x[0].kind_of? State x[0].expected else @@ -490,16 +491,16 @@ def next( event ) @current += 1 if @events[@current].nil? return self unless @choices[@choice].nil? - return @previous.pop + @previous.pop elsif @events[@current].kind_of? State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else - return nil + nil end end @@ -510,7 +511,7 @@ def matches?( event ) def expected return [@events[@current]] if @events[@current] - return @choices[@choice..-1].collect do |x| + @choices[@choice..-1].collect do |x| if x[0].kind_of? State x[0].expected else diff --git a/lib/rexml/validation/validation.rb b/lib/rexml/validation/validation.rb index 0ad6ada4..6475c628 100644 --- a/lib/rexml/validation/validation.rb +++ b/lib/rexml/validation/validation.rb @@ -80,26 +80,26 @@ def done? end def single? 
- return (@event_type != :start_element and @event_type != :start_attribute) + (@event_type != :start_element and @event_type != :start_attribute) end def matches?( event ) return false unless event[0] == @event_type case event[0] when nil - return true + true when :start_element - return true if event[1] == @event_arg + event[1] == @event_arg when :end_element - return true + true when :start_attribute - return true if event[1] == @event_arg + event[1] == @event_arg when :end_attribute - return true + true when :end_document - return true + true when :text - return (@event_arg.nil? or @event_arg == event[1]) + @event_arg.nil? || @event_arg == event[1] =begin when :processing_instruction false diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 70ae8919..5cf3f28c 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -114,7 +114,7 @@ def first( path_stack, node ) case path[0] when :document # do nothing - return first( path[1..-1], node ) + first( path[1..-1], node ) when :child for c in node.children r = first( path[1..-1], c ) @@ -124,9 +124,9 @@ def first( path_stack, node ) name = path[2] if node.name == name return node if path.size == 3 - return first( path[3..-1], node ) + first( path[3..-1], node ) else - return nil + nil end when :descendant_or_self r = first( path[1..-1], node ) @@ -136,11 +136,12 @@ def first( path_stack, node ) return r if r end when :node - return first( path[1..-1], node ) + first( path[1..-1], node ) when :any - return first( path[1..-1], node ) + first( path[1..-1], node ) + else + nil end - return nil end @@ -167,10 +168,10 @@ def strict? # 2. 
If no mapping was supplied, use the context node to look up the namespace def get_namespace( node, prefix ) if @namespaces - return @namespaces[prefix] || '' + @namespaces[prefix] || '' else return node.namespace( prefix ) if node.node_type == :element - return '' + '' end end @@ -757,22 +758,19 @@ def following(node) end def following_node_of( node ) - if node.kind_of? Element and node.children.size > 0 - return node.children[0] - end - return next_sibling_node(node) + return node.children[0] if node.kind_of?(Element) and node.children.size > 0 + + next_sibling_node(node) end def next_sibling_node(node) psn = node.next_sibling_node while psn.nil? - if node.parent.nil? or node.parent.class == Document - return nil - end + return nil if node.parent.nil? or node.parent.class == Document node = node.parent psn = node.next_sibling_node end - return psn + psn end def child(nodeset) @@ -805,13 +803,13 @@ def child(nodeset) def norm b case b when true, false - return b + b when 'true', 'false' - return Functions::boolean( b ) + Functions::boolean( b ) when /^\d+(\.\d+)?$/, Numeric - return Functions::number( b ) + Functions::number( b ) else - return Functions::string( b ) + Functions::string( b ) end end From 04a589a61bf4e366abee8764ee74b03f4aecc4aa Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 08:17:16 +0700 Subject: [PATCH 28/34] Fix a bug that XPath can't be used for no document element (#268) Fixes #267 #249 improved performance by introducing cache. It requires document but we should not break backward compatibility for performance improvement. This restores the previous behavior but no document case doesn't have performance improvement introduced by #249. 
--- lib/rexml/xpath_parser.rb | 7 ++++++- test/parser/test_xpath.rb | 2 +- test/test_xpath_parser.rb | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 test/test_xpath_parser.rb diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 5cf3f28c..64c8846a 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -84,7 +84,12 @@ def parse path, node node = node.first end - node.document.__send__(:enable_cache) do + document = node.document + if document + document.__send__(:enable_cache) do + match( path_stack, node ) + end + else match( path_stack, node ) end end diff --git a/test/parser/test_xpath.rb b/test/parser/test_xpath.rb index 9143d25c..5d62afee 100644 --- a/test/parser/test_xpath.rb +++ b/test/parser/test_xpath.rb @@ -4,7 +4,7 @@ require "rexml/parsers/xpathparser" module REXMLTests - class TestXPathParser < Test::Unit::TestCase + class TestParserXPathParser < Test::Unit::TestCase sub_test_case("#abbreviate") do def abbreviate(xpath) parser = REXML::Parsers::XPathParser.new diff --git a/test/test_xpath_parser.rb b/test/test_xpath_parser.rb new file mode 100644 index 00000000..bcb14c34 --- /dev/null +++ b/test/test_xpath_parser.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +module REXMLTests + class TestXPathParser < Test::Unit::TestCase + def setup + @root_element = make_service_element(["urn:type1", "urn:type2"], ["http://uri"]) + @element = @root_element.children[0] + @parser = REXML::XPathParser.new + end + + def make_service_element(types, uris) + root_element = REXML::Element.new + element = root_element.add_element("Service") + types.each do |type_text| + element.add_element("Type").text = type_text + end + uris.each do |uri_text| + element.add_element("URI").text = uri_text + end + root_element + end + + def test_found + res = @parser.parse("/Service", @root_element) + assert_equal([@element], + res) + end + + def test_not_found + res = 
@parser.parse("/nonexistent", @root_element) + assert_equal([], + res) + end + end +end From 9b084d78708638cedff54743edc0907c4bd6574a Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Tue, 15 Jul 2025 08:23:42 +0700 Subject: [PATCH 29/34] Fix & Deprecate REXML::Text#indent_text (#275) - Fixes #273 - "Fix" in the sense that it restores the original behavior pre-v3.2.6, regardless of its fitness for purpose. - Regression Test Added --------- Co-authored-by: Sutou Kouhei --- lib/rexml/child.rb | 2 +- lib/rexml/text.rb | 4 +++- test/test_text.rb | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/rexml/child.rb b/lib/rexml/child.rb index 40abde87..2718040f 100644 --- a/lib/rexml/child.rb +++ b/lib/rexml/child.rb @@ -88,7 +88,7 @@ def document # This doesn't yet handle encodings def bytes - document.encoding + document&.encoding to_s end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 8799d89d..8d5281cd 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -269,8 +269,10 @@ def wrap(string, width, addnewline=false) end def indent_text(string, level=1, style="\t", indentfirstline=true) + Kernel.warn("#{self.class.name}#indent_text is deprecated. 
See REXML::Formatters", uplevel: 1) return string if level < 0 - new_string = '' + + new_string = +'' string.each_line { |line| indent_string = style * level new_line = (indent_string + line).sub(/[\s]+$/,'') diff --git a/test/test_text.rb b/test/test_text.rb index bae21656..c1f5765e 100644 --- a/test/test_text.rb +++ b/test/test_text.rb @@ -2,6 +2,7 @@ module REXMLTests class TextTester < Test::Unit::TestCase + include Helper::Global include REXML def test_new_text_response_whitespace_default @@ -69,5 +70,12 @@ def test_clone assert_equal(text.to_s, text.clone.to_s) end + + def test_indent_text + text = Text.new("") + suppress_warning do + assert_equal("\tline1\tline2\tline3", text.indent_text("line1\r\nline2\r\nline3\r\n")) + end + end end end From c60ae027a3c20f359fdf76fa41ae64d22313f482 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 16 Jul 2025 08:22:47 +0700 Subject: [PATCH 30/34] Remove bundler from dev deps (#277) Fixes #276 It's redundant. --- Gemfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Gemfile b/Gemfile index d323e2c5..a680c133 100644 --- a/Gemfile +++ b/Gemfile @@ -6,7 +6,6 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } gemspec group :development do - gem "bundler" # This is for suppressing the following warning: # # warning: ostruct was loaded from the standard library, but will From c87bda8bb8773da7e5a0faf9f16ff165eb052a35 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 16 Jul 2025 08:39:31 +0700 Subject: [PATCH 31/34] Remove ostruct from dev deps (#281) Fixes #280 It seems that it's no longer needed. 
--- Gemfile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Gemfile b/Gemfile index a680c133..22520c65 100644 --- a/Gemfile +++ b/Gemfile @@ -6,14 +6,6 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } gemspec group :development do - # This is for suppressing the following warning: - # - # warning: ostruct was loaded from the standard library, but will - # no longer be part of the default gems starting from Ruby 3.5.0. - # - # This should be part of "json". We can remove this when "json" - # depends on "ostruct" explicitly. - gem "ostruct" gem "rake" gem "rdoc" end From 1d876e3bf658b7b4ec7c3372867521695e8eb023 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 19 Aug 2025 09:38:38 +0900 Subject: [PATCH 32/34] Bump actions/checkout from 4 to 5 (#283) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/checkout](https://github.com/actions/checkout) from 4 to 5.
Release notes

Sourced from actions/checkout's releases.

v5.0.0

What's Changed

⚠️ Minimum Compatible Runner Version

v2.327.1
Release Notes

Make sure your runner is updated to this version or newer to use this release.

Full Changelog: https://github.com/actions/checkout/compare/v4...v5.0.0

v4.3.0

What's Changed

New Contributors

Full Changelog: https://github.com/actions/checkout/compare/v4...v4.3.0

v4.2.2

What's Changed

Full Changelog: https://github.com/actions/checkout/compare/v4.2.1...v4.2.2

v4.2.1

What's Changed

New Contributors

Full Changelog: https://github.com/actions/checkout/compare/v4.2.0...v4.2.1

... (truncated)

Changelog

Sourced from actions/checkout's changelog.

Changelog

V5.0.0

V4.3.0

v4.2.2

v4.2.1

v4.2.0

v4.1.7

v4.1.6

v4.1.5

v4.1.4

v4.1.3

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/checkout&package-manager=github_actions&previous-version=4&new-version=5)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/benchmark.yml | 2 +- .github/workflows/release.yml | 4 ++-- .github/workflows/test.yml | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2c638b03..651df879 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -16,7 +16,7 @@ jobs: - ubuntu-latest runs-on: ${{ matrix.runs-on }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 76269f44..f3dffca7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 10 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Extract release note run: | ruby \ @@ -37,7 +37,7 @@ jobs: id-token: write environment: release steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ruby diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0bd43457..31dc02a2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,7 +27,7 @@ jobs: # - runs-on: ubuntu-latest # ruby-version: truffleruby steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} @@ -39,7 +39,7 @@ jobs: name: frozen-string-literal runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ruby @@ -66,7 +66,7 @@ jobs: - windows-latest ruby-version: ${{ fromJson(needs.ruby-versions-gems.outputs.versions) }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: 
ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} @@ -95,7 +95,7 @@ jobs: name: "Document" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ruby @@ -105,7 +105,7 @@ jobs: - name: Build document run: | bundle exec rake warning:error rdoc - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 if: | github.event_name == 'push' with: From 5859bdeac792687eaf93d8e8f0b7e3c1e2ed5c23 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 23 Aug 2025 08:11:58 +0900 Subject: [PATCH 33/34] Added XML declaration check & `Source#skip_spaces` method (#282) ## Why? ### Added XML declaration check - The version attribute is required in XML declaration. - Only version attribute, encoding attribute, and standalone attribute are allowed in XML declaration. - XML declaration is only allowed once. See: https://www.w3.org/TR/xml/#NT-XMLDecl ### Added `Source#skip_spaces` method In the case of `@source.match?(/\s+/um, true)`, if there are no spaces at the beginning, I want to stop reading immediately. However, it continues to read the buffer until it finds a match, but it never finds a match. As a result, it continues reading until the end of the file. In the case of large XML files, drop_parsed_content occur frequently until the buffer is cleared, which may affect performance. 
## Benchmark ``` before after before(YJIT) after(YJIT) dom 32.534 35.130 54.559 53.528 i/s - 100.000 times in 3.073715s 2.846540s 1.832883s 1.868189s sax 44.785 44.089 78.303 77.842 i/s - 100.000 times in 2.232907s 2.268138s 1.277093s 1.284657s pull 51.750 51.105 90.819 90.658 i/s - 100.000 times in 1.932351s 1.956759s 1.101094s 1.103050s stream 51.427 51.444 89.820 88.971 i/s - 100.000 times in 1.944502s 1.943855s 1.113340s 1.123960s Comparison: dom before(YJIT): 54.6 i/s after(YJIT): 53.5 i/s - 1.02x slower after: 35.1 i/s - 1.55x slower before: 32.5 i/s - 1.68x slower sax before(YJIT): 78.3 i/s after(YJIT): 77.8 i/s - 1.01x slower before: 44.8 i/s - 1.75x slower after: 44.1 i/s - 1.78x slower pull before(YJIT): 90.8 i/s after(YJIT): 90.7 i/s - 1.00x slower before: 51.8 i/s - 1.75x slower after: 51.1 i/s - 1.78x slower stream before(YJIT): 89.8 i/s after(YJIT): 89.0 i/s - 1.01x slower after: 51.4 i/s - 1.75x slower before: 51.4 i/s - 1.75x slower ``` - YJIT=ON : 0.98x - 1.00x faster - YJIT=OFF : 0.98x - 1.07x faster --- lib/rexml/parsers/baseparser.rb | 156 +++++++++++++------ lib/rexml/source.rb | 7 +- test/parse/test_document_type_declaration.rb | 6 +- test/parse/test_processing_instruction.rb | 130 +++++++++++++++- test/test_xml_declaration.rb | 2 +- 5 files changed, 244 insertions(+), 57 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index a87657b5..9304e96d 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -144,6 +144,7 @@ module Private PEREFERENCE_PATTERN = /#{PEREFERENCE}/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um + EQUAL_PATTERN = /\s*=\s*/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um NAME_PATTERN = /#{NAME}/um GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" @@ -168,6 +169,7 @@ def initialize( source ) @entity_expansion_limit = Security.entity_expansion_limit @entity_expansion_text_limit = 
Security.entity_expansion_text_limit @source.ensure_buffer + @version = nil end def add_listener( listener ) @@ -280,7 +282,7 @@ def pull_event return [ :comment, process_comment ] elsif @source.match?("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" - unless @source.match?(/\s+/um, true) + unless @source.skip_spaces if @source.match?(">") message = "#{base_error_message}: name is missing" else @@ -290,7 +292,7 @@ def pull_event raise REXML::ParseException.new(message, @source) end name = parse_name(base_error_message) - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces if @source.match?("[", true) id = [nil, nil, nil] @document_status = :in_doctype @@ -306,7 +308,7 @@ def pull_event # For backward compatibility id[1], id[2] = id[2], nil end - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces if @source.match?("[", true) @document_status = :in_doctype elsif @source.match?(">", true) @@ -319,7 +321,7 @@ def pull_event end args = [:start_doctype, name, *id] if @document_status == :after_doctype - @source.match?(/\s*/um, true) + @source.skip_spaces @stack << [ :end_doctype ] end return args @@ -330,7 +332,7 @@ def pull_event end end if @document_status == :in_doctype - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces start_position = @source.position if @source.match?("") message = "#{base_error_message}: name is missing" else @@ -404,7 +406,7 @@ def pull_event id = parse_id(base_error_message, accept_external_id: true, accept_public_id: true) - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces unless @source.match?(">", true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) @@ -425,7 +427,7 @@ def pull_event end end if @document_status == :after_doctype - @source.match?(/\s*/um, true) + @source.skip_spaces end begin start_position = @source.position @@ -642,6 +644,10 @@ def need_source_encoding_update?(xml_declaration_encoding) true end + 
def normalize_xml_declaration_encoding(xml_declaration_encoding) + /\AUTF-16(?:BE|LE)\z/i.match?(xml_declaration_encoding) ? "UTF-16" : nil + end + def parse_name(base_error_message) md = @source.match(Private::NAME_PATTERN, true) unless md @@ -735,37 +741,85 @@ def process_comment def process_instruction name = parse_name("Malformed XML: Invalid processing instruction node") - if @source.match?(/\s+/um, true) - match_data = @source.match(/(.*?)\?>/um, true) - unless match_data - raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + if name == "xml" + xml_declaration + else # PITarget + if @source.skip_spaces # e.g. + start_position = @source.position + content = @source.read_until("?>") + unless content.chomp!("?>") + @source.position = start_position + raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source) + end + else # e.g. + content = nil + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source) + end end - content = match_data[1] - else - content = nil + [:processing_instruction, name, content] + end + end + + def xml_declaration + unless @version.nil? 
+ raise ParseException.new("Malformed XML: XML declaration is duplicated", @source) + end + if @document_status + raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) + end + unless @source.skip_spaces + raise ParseException.new("Malformed XML: XML declaration misses spaces before version", @source) + end + unless @source.match?("version", true) + raise ParseException.new("Malformed XML: XML declaration misses version", @source) + end + @version = parse_attribute_value_with_equal("xml") + unless @source.skip_spaces unless @source.match?("?>", true) - raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) end + encoding = normalize_xml_declaration_encoding(@source.encoding) + return [ :xmldecl, @version, encoding, nil ] # e.g. end - if name == "xml" - if @document_status - raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) - end - version = VERSION.match(content) - version = version[1] unless version.nil? - encoding = ENCODING.match(content) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding + + if @source.match?("encoding", true) + encoding = parse_attribute_value_with_equal("xml") + unless @source.skip_spaces + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) + end + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + encoding ||= normalize_xml_declaration_encoding(@source.encoding) + return [ :xmldecl, @version, encoding, nil ] # e.g. end - if encoding.nil? 
and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" + end + + if @source.match?("standalone", true) + standalone = parse_attribute_value_with_equal("xml") + case standalone + when "yes", "no" + else + raise ParseException.new("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>", @source) end - standalone = STANDALONE.match(content) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] end - [:processing_instruction, name, content] + @source.skip_spaces + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) + end + + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + encoding ||= normalize_xml_declaration_encoding(@source.encoding) + + # e.g. + # + # + # + [ :xmldecl, @version, encoding, standalone ] end if StringScanner::Version < "3.1.1" @@ -787,6 +841,25 @@ def scan_quote end end + def parse_attribute_value_with_equal(name) + unless @source.match?(Private::EQUAL_PATTERN, true) + message = "Missing attribute equal: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + unless quote = scan_quote + message = "Missing attribute value start quote: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + start_position = @source.position + value = @source.read_until(quote) + unless value.chomp!(quote) + @source.position = start_position + message = "Missing attribute value end quote: <#{name}>: <#{quote}>" + raise REXML::ParseException.new(message, @source) + end + value + end + def parse_attributes(prefixes) attributes = {} expanded_names = {} @@ -801,23 +874,8 @@ def parse_attributes(prefixes) name = match[1] prefix = match[2] local_part = match[3] - - unless @source.match?(/\s*=\s*/um, true) - message = "Missing attribute equal: <#{name}>" - raise REXML::ParseException.new(message, @source) - end - unless quote = scan_quote - message = "Missing attribute 
value start quote: <#{name}>" - raise REXML::ParseException.new(message, @source) - end - start_position = @source.position - value = @source.read_until(quote) - unless value.chomp!(quote) - @source.position = start_position - message = "Missing attribute value end quote: <#{name}>: <#{quote}>" - raise REXML::ParseException.new(message, @source) - end - @source.match?(/\s*/um, true) + value = parse_attribute_value_with_equal(name) + @source.skip_spaces if prefix == "xmlns" if local_part == "xml" if value != Private::XML_PREFIXED_NAMESPACE diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 3ec1141e..99500072 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -65,9 +65,10 @@ class Source attr_reader :encoding module Private + SPACES_PATTERN = /\s+/um SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} - pre_defined_terms = ["'", '"', "<", "]]>"] + pre_defined_terms = ["'", '"', "<", "]]>", "?>"] if StringScanner::Version < "3.1.1" pre_defined_terms.each do |term| PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ @@ -150,6 +151,10 @@ def match?(pattern, cons=false) end end + def skip_spaces + @scanner.skip(Private::SPACES_PATTERN) ? 
true : false + end + def position @scanner.pos end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index b22863a9..d4658b9e 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -49,10 +49,10 @@ def test_no_name end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed DOCTYPE: name is missing -Line: 3 -Position: 17 +Line: 1 +Position: 10 Last 80 unconsumed characters: - + DETAIL end end diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index ba381dc4..70d17747 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -30,7 +30,7 @@ def test_unclosed_content parse(" Line: 1 Position: 14 Last 80 unconsumed characters: @@ -43,7 +43,7 @@ def test_unclosed_no_content parse(" Line: 1 Position: 6 Last 80 unconsumed characters: @@ -51,6 +51,19 @@ def test_unclosed_no_content DETAIL end + def test_xml_declaration_duplicated + exception = assert_raise(REXML::ParseException) do + parse('') + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed XML: XML declaration is duplicated +Line: 1 +Position: 42 +Last 80 unconsumed characters: + version="1.0"?> + DETAIL + end + def test_xml_declaration_not_at_document_start exception = assert_raise(REXML::ParseException) do parser = REXML::Parsers::BaseParser.new('') @@ -64,7 +77,118 @@ def test_xml_declaration_not_at_document_start Line: 1 Position: 25 Last 80 unconsumed characters: + version="1.0" ?> + DETAIL + end + + def test_xml_declaration_missing_spaces + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? 
+ parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration misses spaces before version + Line: 1 + Position: 7 + Last 80 unconsumed characters: + ?> + DETAIL + end + + def test_xml_declaration_missing_version + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration misses version + Line: 1 + Position: 8 + Last 80 unconsumed characters: + ?> + DETAIL + end + + def test_xml_declaration_unclosed_content + exception = assert_raise(REXML::ParseException) do + parse('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 37 + Last 80 unconsumed characters: + encoding="UTF-8"?> + DETAIL + end + + def test_xml_declaration_unclosed_content_missing_space_after_encoding + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 53 + Last 80 unconsumed characters: + standalone="no"?> + DETAIL + end + + def test_xml_declaration_unclosed_content_with_unknown_attributes + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? 
+ parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 31 + Last 80 unconsumed characters: + test="no"?> + DETAIL + end + + def test_xml_declaration_standalone_no_yes_or_no + exception = assert_raise(REXML::ParseException) do + parse('') + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed XML: XML declaration standalone is not yes or no : +Line: 1 +Position: 38 +Last 80 unconsumed characters: +?> DETAIL end end @@ -113,7 +237,7 @@ def test_content_question def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new("" * n + " ?>") + REXML::Document.new("" * n + " ?>") end end diff --git a/test/test_xml_declaration.rb b/test/test_xml_declaration.rb index 6a1f4df0..4503a90e 100644 --- a/test/test_xml_declaration.rb +++ b/test/test_xml_declaration.rb @@ -7,7 +7,7 @@ module REXMLTests class TestXmlDeclaration < Test::Unit::TestCase def setup xml = <<~XML - + XML From f36916fe1c66b8cdc1fe482263115625e084d8fe Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 26 Aug 2025 14:29:47 +0900 Subject: [PATCH 34/34] Add 3.4.2 entry (#284) --- NEWS.md | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/NEWS.md b/NEWS.md index 7f95d829..313b07d5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,107 @@ # News +## 3.4.2 - 2025-08-26 {#version-3-4-2} + +### Improvement + + * Improved performance. 
+ * GH-244 + * GH-245 + * GH-246 + * GH-249 + * GH-256 + * Patch by NAITOH Jun + + * Raise appropriate exception when failing to match start tag in DOCTYPE + * GH-247 + * Patch by NAITOH Jun + + * Deprecate accepting array as an element in XPath.match, first and each + * GH-252 + * Patch by tomoya ishida + + * Don't call needless encoding_updated + * GH-259 + * Patch by Sutou Kouhei + + * Reuse XPath::match + * GH-263 + * Patch by pboling + + * Cache redundant calls for doctype + * GH-264 + * Patch by pboling + + * Use Safe Navigation (&.) from Ruby 2.3 + * GH-265 + * Patch by pboling + + * Remove redundant return statements + * GH-266 + * Patch by pboling + + * Added XML declaration check & Source#skip_spaces method + * GH-282 + * Patch by NAITOH Jun + * Reported by Sofi Aberegg + +### Fixes + + * Fix docs typo + * GH-248 + * Patch by James Coleman + + * Fix reverse sort in xpath_parser + * GH-251 + * Patch by tomoya ishida + + * Fix duplicate responses in XPath following, following-sibling, preceding, preceding-sibling + * GH-255 + * Patch by NAITOH Jun + + * Fix wrong Encoding resolution + * GH-258 + * Patch by Sutou Kouhei + + * Handle nil when parsing fragment + * GH-267 + * GH-268 + * Patch by pboling + + * [Documentation] Use # to reference instance methods + * GH-269 + * GH-270 + * Patch by pboling + + * Fix & Deprecate REXML::Text#text_indent + * GH-273 + * GH-275 + * Patch by pboling + + * remove bundler from dev deps + * GH-276 + * GH-277 + * Patch by pboling + + * remove ostruct from dev deps + * GH-280 + * GH-281 + * Patch by pboling + +### Thanks + + * NAITOH Jun + + * tomoya ishida + + * James Coleman + + * pboling + + * Sutou Kouhei + + * Sofi Aberegg + ## 3.4.1 - 2025-02-16 {#version-3-4-1} ### Improvement