From 10856798f530987aeb9aef351e681b0a33e6f4ae Mon Sep 17 00:00:00 2001 From: Chris Guida Date: Thu, 14 Jan 2021 19:46:28 -0600 Subject: [PATCH 1/3] Add classes and utilities for parsing undo data in rev*.dat files --- blockchain_parser/block.py | 6 +- blockchain_parser/blockchain.py | 14 ++- blockchain_parser/input.py | 4 +- blockchain_parser/output.py | 4 +- blockchain_parser/tests/test_utils.py | 10 +-- blockchain_parser/transaction.py | 10 +-- blockchain_parser/undo.py | 122 ++++++++++++++++++++++++++ blockchain_parser/utils.py | 86 +++++++++++++++++- examples/ordered-blocks.py | 2 +- examples/undo-blocks.py | 16 ++++ 10 files changed, 254 insertions(+), 20 deletions(-) create mode 100644 blockchain_parser/undo.py create mode 100644 examples/undo-blocks.py diff --git a/blockchain_parser/block.py b/blockchain_parser/block.py index f00db41..adbdb78 100644 --- a/blockchain_parser/block.py +++ b/blockchain_parser/block.py @@ -11,7 +11,7 @@ from .transaction import Transaction from .block_header import BlockHeader -from .utils import format_hash, decode_varint, double_sha256 +from .utils import format_hash, decode_compactsize, double_sha256 def get_block_transactions(raw_hex): @@ -23,7 +23,7 @@ def get_block_transactions(raw_hex): # Decoding the number of transactions, offset is the size of # the varint (1 to 9 bytes) - n_transactions, offset = decode_varint(transaction_data) + n_transactions, offset = decode_compactsize(transaction_data) for i in range(n_transactions): # Try from 1024 (1KiB) -> 1073741824 (1GiB) slice widths @@ -77,7 +77,7 @@ def n_transactions(self): as there's no need to parse all transactions to get this information """ if self._n_transactions is None: - self._n_transactions = decode_varint(self.hex[80:])[0] + self._n_transactions = decode_compactsize(self.hex[80:])[0] return self._n_transactions diff --git a/blockchain_parser/blockchain.py b/blockchain_parser/blockchain.py index 454e7c2..5bd82ae 100644 --- a/blockchain_parser/blockchain.py +++ 
def get_undo_files(path):
    """
    Given the path to the directory holding the undo data (the same
    directory that holds the blk*.dat files, e.g. ~/.bitcoin/blocks),
    returns the sorted list of rev*.dat files contained in that directory.

    If path is not a directory it is assumed to already be a single undo
    file and is returned unchanged as a one-element list.
    os.stat raises (e.g. FileNotFoundError) if path does not exist.
    """
    if not stat.S_ISDIR(os.stat(path)[stat.ST_MODE]):
        return [path]
    # Sorting the joined paths orders the files by their numeric suffix,
    # because the file names are zero-padded (rev00000.dat, rev00001.dat...)
    return sorted(
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.startswith("rev") and name.endswith(".dat")
    )
-from .utils import decode_varint, decode_uint64 +from .utils import decode_compactsize, decode_uint64, decode_varint, decompress_txout_amt from .script import Script from .address import Address @@ -22,7 +22,7 @@ def __init__(self, raw_hex): self._script = None self._addresses = None - script_length, varint_size = decode_varint(raw_hex[8:]) + script_length, varint_size = decode_compactsize(raw_hex[8:]) script_start = 8 + varint_size self._script_hex = raw_hex[script_start:script_start+script_length] diff --git a/blockchain_parser/tests/test_utils.py b/blockchain_parser/tests/test_utils.py index e70cb25..9acde55 100644 --- a/blockchain_parser/tests/test_utils.py +++ b/blockchain_parser/tests/test_utils.py @@ -44,12 +44,12 @@ def test_decode_uint64(self): for uint64, value in uint64_dict.items(): self.assertEqual(utils.decode_uint64(a2b_hex(uint64)), value) - def test_decode_varint(self): + def test_decode_compactsize(self): case1 = a2b_hex("fa") - self.assertEqual(utils.decode_varint(case1), (250, 1)) + self.assertEqual(utils.decode_compactsize(case1), (250, 1)) case2 = a2b_hex("fd0100") - self.assertEqual(utils.decode_varint(case2), (1, 3)) + self.assertEqual(utils.decode_compactsize(case2), (1, 3)) case3 = a2b_hex("fe01000000") - self.assertEqual(utils.decode_varint(case3), (1, 5)) + self.assertEqual(utils.decode_compactsize(case3), (1, 5)) case4 = a2b_hex("ff0100000000000000") - self.assertEqual(utils.decode_varint(case4), (1, 9)) + self.assertEqual(utils.decode_compactsize(case4), (1, 9)) diff --git a/blockchain_parser/transaction.py b/blockchain_parser/transaction.py index 9277882..9dd8a79 100644 --- a/blockchain_parser/transaction.py +++ b/blockchain_parser/transaction.py @@ -11,7 +11,7 @@ from math import ceil -from .utils import decode_varint, decode_uint32, double_sha256, format_hash +from .utils import decode_compactsize, decode_uint32, double_sha256, format_hash from .input import Input from .output import Output @@ -44,7 +44,7 @@ def __init__(self, 
raw_hex): self.is_segwit = True offset += 2 - self.n_inputs, varint_size = decode_varint(raw_hex[offset:]) + self.n_inputs, varint_size = decode_compactsize(raw_hex[offset:]) offset += varint_size self.inputs = [] @@ -53,7 +53,7 @@ def __init__(self, raw_hex): offset += input.size self.inputs.append(input) - self.n_outputs, varint_size = decode_varint(raw_hex[offset:]) + self.n_outputs, varint_size = decode_compactsize(raw_hex[offset:]) offset += varint_size self.outputs = [] @@ -65,10 +65,10 @@ def __init__(self, raw_hex): if self.is_segwit: self._offset_before_tx_witnesses = offset for inp in self.inputs: - tx_witnesses_n, varint_size = decode_varint(raw_hex[offset:]) + tx_witnesses_n, varint_size = decode_compactsize(raw_hex[offset:]) offset += varint_size for j in range(tx_witnesses_n): - component_length, varint_size = decode_varint( + component_length, varint_size = decode_compactsize( raw_hex[offset:]) offset += varint_size witness = raw_hex[offset:offset + component_length] diff --git a/blockchain_parser/undo.py b/blockchain_parser/undo.py new file mode 100644 index 0000000..140e620 --- /dev/null +++ b/blockchain_parser/undo.py @@ -0,0 +1,122 @@ +# Copyright (C) 2015-2016 The bitcoin-blockchain-parser developers +# +# This file is part of bitcoin-blockchain-parser. +# +# It is subject to the license terms in the LICENSE file found in the top-level +# directory of this distribution. +# +# No part of bitcoin-blockchain-parser, including this file, may be copied, +# modified, propagated, or distributed except according to the terms contained +# in the LICENSE file. 
# NOTE: decode_varint, decode_compactsize and decompress_txout_amt are
# provided by blockchain_parser.utils (module-level import of undo.py).


class BlockUndo(object):
    """
    Represents a block of spent transaction outputs (coins), as encoded
    in the undo rev*.dat files
    """
    def __init__(self, raw_hex):
        """
        :param raw_hex: raw bytes of one undo block, starting with the
                        compactsize count of transactions
        """
        self._raw_hex = raw_hex
        self.spends = []
        num_txs, pos = decode_compactsize(raw_hex)
        for _ in range(num_txs):
            txn = SpentTransaction(raw_hex=raw_hex[pos:])
            self.spends.append(txn)
            # each SpentTransaction reports how many bytes it consumed
            pos += txn.len


class SpentTransaction(object):
    """Represents the list of outputs (coins) spent by one transaction"""
    def __init__(self, raw_hex=None):
        """
        :param raw_hex: raw bytes starting with the compactsize count of
                        spent outputs; trailing bytes are ignored
        """
        self._raw_hex = raw_hex
        self.outputs = []
        self.output_len, pos = decode_compactsize(raw_hex)
        for _ in range(self.output_len):
            output = SpentOutput(raw_hex=raw_hex[pos:])
            self.outputs.append(output)
            pos += output.len
        # number of bytes of raw_hex used by this transaction
        self.len = pos

    @classmethod
    def from_hex(cls, hex_):
        return cls(hex_)


class SpentOutput(object):
    """Represents a spent Transaction output"""

    def __init__(self, raw_hex=None):
        """
        :param raw_hex: raw bytes starting with the VarInt height code;
                        trailing bytes are ignored
        """
        self._raw_hex = raw_hex

        # height code: lowest bit is the coinbase flag, the remaining
        # bits are the height of the block that created the output
        height_code, pos = decode_varint(raw_hex)
        self.is_coinbase = height_code % 2 == 1
        self.height = height_code // 2

        # A dummy VarInt (0x00 today, the tx version in old undo files) is
        # only stored when height > 0; records with height 0 do not have it,
        # so it must not be skipped unconditionally.
        if self.height > 0:
            _, dummy_len = decode_varint(raw_hex[pos:])
            pos += dummy_len

        # decode compressed txout amount
        compressed_amt, compressed_amt_len = decode_varint(raw_hex[pos:])
        self.amt = decompress_txout_amt(compressed_amt)
        pos += compressed_amt_len

        # get script (still in its compressed on-disk form)
        script_hex, _ = SpentScriptPubKey.extract_from_hex(raw_hex[pos:])
        self.script_pub_key = SpentScriptPubKey(script_hex)
        # number of bytes of raw_hex used by this output
        self.len = pos + self.script_pub_key.len

    @classmethod
    def from_hex(cls, hex_):
        return cls(hex_)


class SpentScriptPubKey(object):
    """Represents the script portion of a spent Transaction output"""
    def __init__(self, raw_hex=None):
        """
        :param raw_hex: the compressed script bytes, exactly as returned by
                        extract_from_hex (type byte or size prefix included)
        """
        self._raw_hex = raw_hex
        self.len = len(raw_hex)

    @classmethod
    def from_hex(cls, hex_):
        return cls(hex_)

    @classmethod
    def extract_from_hex(cls, raw_hex):
        """
        Cuts the compressed script found at the start of raw_hex.

        The first byte selects the layout:
          0x00, 0x01       -> 1 type byte + 20 byte hash
          0x02 ... 0x05    -> 1 type byte + 32 byte public key x coordinate
          anything else    -> VarInt holding (script length + 6), followed
                              by the script itself

        :param raw_hex: bytes starting with a compressed script
        :return: tuple (compressed script bytes, number of bytes consumed)
        """
        if raw_hex[0] in (0x00, 0x01):
            total_len = 21
        elif raw_hex[0] in (0x02, 0x03, 0x04, 0x05):
            total_len = 33
        else:
            script_len_code, script_len_code_len = decode_varint(raw_hex)
            # the 6 special types above are reserved, hence the offset
            total_len = script_len_code_len + script_len_code - 6
        # The second element is the consumed length for every layout (the
        # raw-script branch used to report the length without its prefix).
        return (raw_hex[:total_len], total_len)
def decode_varint(raw_hex):
    """
    Reads the weird format of VarInt present in src/serialize.h of bitcoin core
    and being used for storing data in the leveldb.
    This is not the VARINT format described for general bitcoin serialization
    use.

    :param raw_hex: bytes starting with the VarInt; trailing bytes are ignored
    :return: tuple (decoded value, number of bytes consumed)
    :raises IndexError: if raw_hex ends before the last byte of the VarInt
    """
    n = 0
    for pos, data in enumerate(raw_hex, start=1):
        n = (n << 7) | (data & 0x7f)
        if data & 0x80 == 0:
            return n, pos
        # every continuation byte adds one, so each value has exactly one
        # encoding
        n += 1
    raise IndexError("truncated VarInt: %r" % (raw_hex,))


def decompress_txout_amt(amount_compressed_int):
    """
    Decompresses a Satoshi amount stored in compressed form. Inverse of
    compress_txout_amt.
    (this function stolen from https://github.com/sr-gi/bitcoin_tools and
    modified to remove bug)

    :param amount_compressed_int: The compressed amount.
    :type amount_compressed_int: int
    :return: The amount in Satoshis.
    :rtype: int
    """
    # No need to do any work if it's zero.
    if amount_compressed_int == 0:
        return 0

    # The compressed value is one of the following two equations:
    # x = 1 + 10*(9*n + d - 1) + e   | where e < 9
    # x = 1 + 10*(n - 1) + 9         | where e = 9
    remaining, exponent = divmod(amount_compressed_int - 1, 10)

    # remaining is now one of:
    # 9*n + d - 1   | where e < 9
    # n - 1         | where e = 9
    if exponent < 9:
        remaining, last_digit = divmod(remaining, 9)
        n = remaining * 10 + last_digit + 1
    else:
        n = remaining + 1

    # Apply the exponent.
    return n * 10**exponent


def compress_txout_amt(n):
    """ Compresses the Satoshi amount of a UTXO to be stored in the LevelDB. Code is a port from the Bitcoin Core C++
    source:
        https://github.com/bitcoin/bitcoin/blob/v0.13.2/src/compressor.cpp#L133#L160
    :param n: Satoshi amount to be compressed.
    :type n: int
    :return: The compressed amount of Satoshis.
    :rtype: int
    :raises ValueError: if n is negative
    (this function stolen from https://github.com/sr-gi/bitcoin_tools and modified to remove bug)
    """

    if n < 0:
        raise ValueError("Cannot compress a negative amount: %d" % n)
    if n == 0:
        return 0

    # strip up to 9 trailing decimal zeros, remembering how many
    e = 0
    while ((n % 10) == 0) and e < 9:
        n //= 10
        e += 1

    if e < 9:
        # the loop stopped on a non-zero last digit, so 1 <= d <= 9
        d = n % 10
        n //= 10
        return 1 + (n * 9 + d - 1) * 10 + e
    else:
        return 1 + (n - 1) * 10 + 9
4164324..34f7f88 100644 --- a/examples/undo-blocks.py +++ b/examples/undo-blocks.py @@ -1,9 +1,8 @@ import os -import plyvel from blockchain_parser.blockchain import * -from blockchain_parser.output import * -from blockchain_parser.utils import * from blockchain_parser.blockchain import Blockchain +from blockchain_parser.utils import * +from blockchain_parser.undo import * undo_files = get_undo_files(os.path.expanduser('~/.bitcoin/blocks')) undo_block_ctr = 0 From 173dfaf9a7d07bf9ade6d36f2a9658c4d3c9c6cd Mon Sep 17 00:00:00 2001 From: Chris Guida Date: Fri, 19 Jan 2024 15:04:01 -0600 Subject: [PATCH 3/3] save progress --- blockchain_parser/output.py | 2 +- blockchain_parser/undo.py | 66 ++++++++++++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/blockchain_parser/output.py b/blockchain_parser/output.py index 7a9011b..894fc81 100644 --- a/blockchain_parser/output.py +++ b/blockchain_parser/output.py @@ -9,7 +9,7 @@ # modified, propagated, or distributed except according to the terms contained # in the LICENSE file. -from .utils import decode_compactsize, decode_uint64, decode_varint, decompress_txout_amt +from .utils import decode_compactsize, decode_uint64 from .script import Script from .address import Address diff --git a/blockchain_parser/undo.py b/blockchain_parser/undo.py index 140e620..8c0f169 100644 --- a/blockchain_parser/undo.py +++ b/blockchain_parser/undo.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2016 The bitcoin-blockchain-parser developers +# Copyright (C) 2015-2020 The bitcoin-blockchain-parser developers # # This file is part of bitcoin-blockchain-parser. 
# NOTE: decode_varint and decompress_txout_amt are provided by
# blockchain_parser.utils (module-level import of undo.py).

# Prime of the secp256k1 field, needed to rebuild the y coordinate of a
# public key that was stored as x coordinate + parity only.
_SECP256K1_P = 2**256 - 2**32 - 977

# Script type codes 0..5 are reserved for the special compressed layouts;
# any other script is stored raw with (length + 6) as its size code.
_N_SPECIAL_SCRIPTS = 6

_WRONG_SIZE = "Compressed script has wrong size"


def _uncompress_pubkey(y_is_odd, x_bytes):
    """Returns the 65 byte uncompressed key (0x04 + x + y) for x_bytes."""
    x = int.from_bytes(x_bytes, "big")
    y_squared = (pow(x, 3, _SECP256K1_P) + 7) % _SECP256K1_P
    # p % 4 == 3, so a square root (if any) is y_squared ** ((p + 1) / 4)
    y = pow(y_squared, (_SECP256K1_P + 1) // 4, _SECP256K1_P)
    if (y * y) % _SECP256K1_P != y_squared:
        raise ValueError("Compressed public key is not on the curve")
    if (y % 2 == 1) != y_is_odd:
        y = _SECP256K1_P - y
    return b"\x04" + x_bytes + y.to_bytes(32, "big")


def decompress_script(raw_hex):
    """ Takes a script as stored in the undo files / leveldb and returns it
    in uncompressed form. The (de)compression scheme is defined in
    bitcoin/src/compressor.cpp
    (this code adapted from https://github.com/sr-gi/bitcoin_tools)

    :param raw_hex: the compressed script, including its leading type byte
                    (types 0 to 5) or VarInt size code (raw scripts)
    :type raw_hex: bytes
    :return: the decompressed script
    :rtype: bytes
    :raises ValueError: if raw_hex is empty, has the wrong size for its
                        type, or holds an invalid public key
    """
    if not raw_hex:
        raise ValueError("Cannot decompress an empty script")

    script_type = raw_hex[0]
    payload = bytes(raw_hex[1:])

    if script_type == 0:
        # P2PKH: OP_DUP OP_HASH160 <20 bytes> OP_EQUALVERIFY OP_CHECKSIG
        if len(payload) != 20:
            raise ValueError(_WRONG_SIZE)
        return b"\x76\xa9\x14" + payload + b"\x88\xac"

    if script_type == 1:
        # P2SH: OP_HASH160 <20 bytes> OP_EQUAL
        if len(payload) != 20:
            raise ValueError(_WRONG_SIZE)
        return b"\xa9\x14" + payload + b"\x87"

    if script_type in (2, 3):
        # P2PK, compressed key: the type byte is the key's own prefix
        if len(payload) != 32:
            raise ValueError(_WRONG_SIZE)
        return b"\x21" + bytes([script_type]) + payload + b"\xac"

    if script_type in (4, 5):
        # P2PK, uncompressed key stored as x only; type - 2 is the prefix
        # (2 or 3) the key would have in compressed form, i.e. y's parity
        if len(payload) != 32:
            raise ValueError(_WRONG_SIZE)
        return b"\x41" + _uncompress_pubkey(script_type == 5, payload) + b"\xac"

    # Any other script is stored as is, prefixed by a VarInt size code
    # (which may be longer than one byte).
    size_code, prefix_len = decode_varint(raw_hex)
    script_len = size_code - _N_SPECIAL_SCRIPTS
    script = bytes(raw_hex[prefix_len:prefix_len + script_len])
    if len(script) != script_len:
        raise ValueError(_WRONG_SIZE)
    return script


class SpentOutput(object):
    """Represents a spent Transaction output"""

    def __init__(self, raw_hex=None):
        """
        :param raw_hex: raw bytes starting with the VarInt height code;
                        trailing bytes are ignored
        """
        self._raw_hex = raw_hex

        # height code: lowest bit is the coinbase flag, the remaining
        # bits are the height of the block that created the output
        height_code, pos = decode_varint(raw_hex)
        self.is_coinbase = height_code % 2 == 1
        self.height = height_code // 2

        # A dummy VarInt (0x00 today, the tx version in old undo files) is
        # only stored when height > 0; records with height 0 do not have it,
        # so it must not be skipped unconditionally.
        if self.height > 0:
            _, dummy_len = decode_varint(raw_hex[pos:])
            pos += dummy_len

        # decode compressed txout amount
        compressed_amt, compressed_amt_len = decode_varint(raw_hex[pos:])
        self.amt = decompress_txout_amt(compressed_amt)
        pos += compressed_amt_len

        # get script (kept in its compressed on-disk form)
        script_hex, _ = SpentScriptPubKey.extract_from_hex(raw_hex[pos:])
        self.script_pub_key_compressed = SpentScriptPubKey(script_hex)
        # number of bytes of raw_hex used by this output
        self.len = pos + self.script_pub_key_compressed.len

    @classmethod
    def from_hex(cls, hex_):
        return cls(hex_)

    @property
    def script(self):
        """The decompressed script (bytes) of this output."""
        # the SpentScriptPubKey does the decompression and caches it
        return self.script_pub_key_compressed.script


class SpentScriptPubKey(object):
    """Represents the script portion of a spent Transaction output"""

    # cache for the script property; None means "not decompressed yet"
    _script = None

    def __init__(self, raw_hex=None):
        """
        :param raw_hex: the compressed script bytes, exactly as returned by
                        extract_from_hex (type byte or size prefix included)
        """
        self._raw_hex = raw_hex
        self.len = len(raw_hex)

    @classmethod
    def from_hex(cls, hex_):
        return cls(hex_)

    @classmethod
    def extract_from_hex(cls, raw_hex):
        """
        Cuts the compressed script found at the start of raw_hex.

        The first byte selects the layout:
          0x00, 0x01       -> 1 type byte + 20 byte hash
          0x02 ... 0x05    -> 1 type byte + 32 byte public key x coordinate
          anything else    -> VarInt holding (script length + 6), followed
                              by the script itself

        :param raw_hex: bytes starting with a compressed script
        :return: tuple (compressed script bytes, number of bytes consumed)
        """
        if raw_hex[0] in (0x00, 0x01):
            total_len = 21
        elif raw_hex[0] in (0x02, 0x03, 0x04, 0x05):
            total_len = 33
        else:
            script_len_code, script_len_code_len = decode_varint(raw_hex)
            total_len = script_len_code_len + script_len_code - _N_SPECIAL_SCRIPTS
        return (raw_hex[:total_len], total_len)

    @property
    def script(self):
        """The decompressed script (bytes), computed on first access."""
        # The cache lives in _script: reading self.script in here would
        # call this getter again and never terminate.
        if self._script is None:
            self._script = decompress_script(self._raw_hex)
        return self._script