diff --git a/blockchain_parser/block.py b/blockchain_parser/block.py
index 15caf22..7054361 100644
--- a/blockchain_parser/block.py
+++ b/blockchain_parser/block.py
@@ -11,7 +11,7 @@
 
 from .transaction import Transaction
 from .block_header import BlockHeader
-from .utils import format_hash, decode_varint, double_sha256
+from .utils import format_hash, decode_compactsize, double_sha256
 
 
 def get_block_transactions(raw_hex):
@@ -23,7 +23,7 @@ def get_block_transactions(raw_hex):
 
     # Decoding the number of transactions, offset is the size of
     # the varint (1 to 9 bytes)
-    n_transactions, offset = decode_varint(transaction_data)
+    n_transactions, offset = decode_compactsize(transaction_data)
 
     for i in range(n_transactions):
         # Try from 1024 (1KiB) -> 1073741824 (1GiB) slice widths
@@ -78,7 +78,7 @@ def n_transactions(self):
         as there's no need to parse all transactions to get this information
         """
         if self._n_transactions is None:
-            self._n_transactions = decode_varint(self.hex[80:])[0]
+            self._n_transactions = decode_compactsize(self.hex[80:])[0]
 
         return self._n_transactions
 
diff --git a/blockchain_parser/blockchain.py b/blockchain_parser/blockchain.py
index c4d24be..0916a77 100644
--- a/blockchain_parser/blockchain.py
+++ b/blockchain_parser/blockchain.py
@@ -42,10 +42,25 @@ def get_files(path):
     files = map(lambda x: os.path.join(path, x), files)
     return sorted(files)
 
+
+def get_undo_files(path):
+    """
+    Given the path to the .bitcoin directory, returns the sorted list of
+    rev*.dat files contained in that directory. When path is not a
+    directory, the returned list holds path alone.
+    """
+    if not stat.S_ISDIR(os.stat(path)[stat.ST_MODE]):
+        return [path]
+    return sorted(
+        os.path.join(path, name)
+        for name in os.listdir(path)
+        if name.startswith("rev") and name.endswith(".dat")
+    )
+
 
 def get_blocks(blockfile):
     """
-    Given the name of a .blk file, for every block contained in the file,
+    Given the name of a .dat file, for every block contained in the file,
     yields its raw hexadecimal value
     """
     with open(blockfile, "rb") as f:
diff --git a/blockchain_parser/input.py b/blockchain_parser/input.py
index 564d9ae..09d974b 100644
--- a/blockchain_parser/input.py
+++ b/blockchain_parser/input.py
@@ -9,7 +9,7 @@
 # modified, propagated, or distributed except according to the terms contained
 # in the LICENSE file.
 
-from .utils import decode_varint, decode_uint32, format_hash
+from .utils import decode_compactsize, decode_uint32, format_hash
 from .script import Script
 
 
@@ -23,7 +23,7 @@ def __init__(self, raw_hex):
         self._sequence_number = None
         self._witnesses = []
 
-        self._script_length, varint_length = decode_varint(raw_hex[36:])
+        self._script_length, varint_length = decode_compactsize(raw_hex[36:])
         self._script_start = 36 + varint_length
         self.size = self._script_start + self._script_length + 4
 
diff --git a/blockchain_parser/output.py b/blockchain_parser/output.py
index 5ef1a58..894fc81 100644
--- a/blockchain_parser/output.py
+++ b/blockchain_parser/output.py
@@ -9,7 +9,7 @@
 # modified, propagated, or distributed except according to the terms contained
 # in the LICENSE file.
 
-from .utils import decode_varint, decode_uint64
+from .utils import decode_compactsize, decode_uint64
 from .script import Script
 from .address import Address
 
@@ -22,7 +22,7 @@ def __init__(self, raw_hex):
         self._script = None
         self._addresses = None
 
-        script_length, varint_size = decode_varint(raw_hex[8:])
+        script_length, varint_size = decode_compactsize(raw_hex[8:])
         script_start = 8 + varint_size
 
         self._script_hex = raw_hex[script_start:script_start+script_length]
diff --git a/blockchain_parser/tests/test_utils.py b/blockchain_parser/tests/test_utils.py
index e70cb25..9acde55 100644
--- a/blockchain_parser/tests/test_utils.py
+++ b/blockchain_parser/tests/test_utils.py
@@ -44,12 +44,12 @@ def test_decode_uint64(self):
         for uint64, value in uint64_dict.items():
             self.assertEqual(utils.decode_uint64(a2b_hex(uint64)), value)
 
-    def test_decode_varint(self):
+    def test_decode_compactsize(self):
         case1 = a2b_hex("fa")
-        self.assertEqual(utils.decode_varint(case1), (250, 1))
+        self.assertEqual(utils.decode_compactsize(case1), (250, 1))
         case2 = a2b_hex("fd0100")
-        self.assertEqual(utils.decode_varint(case2), (1, 3))
+        self.assertEqual(utils.decode_compactsize(case2), (1, 3))
         case3 = a2b_hex("fe01000000")
-        self.assertEqual(utils.decode_varint(case3), (1, 5))
+        self.assertEqual(utils.decode_compactsize(case3), (1, 5))
         case4 = a2b_hex("ff0100000000000000")
-        self.assertEqual(utils.decode_varint(case4), (1, 9))
+        self.assertEqual(utils.decode_compactsize(case4), (1, 9))
diff --git a/blockchain_parser/transaction.py b/blockchain_parser/transaction.py
index 9277882..9dd8a79 100644
--- a/blockchain_parser/transaction.py
+++ b/blockchain_parser/transaction.py
@@ -11,7 +11,7 @@
 
 from math import ceil
 
-from .utils import decode_varint, decode_uint32, double_sha256, format_hash
+from .utils import decode_compactsize, decode_uint32, double_sha256, format_hash
 from .input import Input
 from .output import Output
 
@@ -44,7 +44,7 @@ def __init__(self, raw_hex):
             self.is_segwit = True
             offset += 2
 
-        self.n_inputs, varint_size = decode_varint(raw_hex[offset:])
+        self.n_inputs, varint_size = decode_compactsize(raw_hex[offset:])
         offset += varint_size
 
         self.inputs = []
@@ -53,7 +53,7 @@ def __init__(self, raw_hex):
             offset += input.size
             self.inputs.append(input)
 
-        self.n_outputs, varint_size = decode_varint(raw_hex[offset:])
+        self.n_outputs, varint_size = decode_compactsize(raw_hex[offset:])
         offset += varint_size
 
         self.outputs = []
@@ -65,10 +65,10 @@ def __init__(self, raw_hex):
         if self.is_segwit:
             self._offset_before_tx_witnesses = offset
             for inp in self.inputs:
-                tx_witnesses_n, varint_size = decode_varint(raw_hex[offset:])
+                tx_witnesses_n, varint_size = decode_compactsize(raw_hex[offset:])
                 offset += varint_size
                 for j in range(tx_witnesses_n):
-                    component_length, varint_size = decode_varint(
+                    component_length, varint_size = decode_compactsize(
                         raw_hex[offset:])
                     offset += varint_size
                     witness = raw_hex[offset:offset + component_length]
diff --git a/blockchain_parser/undo.py b/blockchain_parser/undo.py
new file mode 100644
index 0000000..8c0f169
--- /dev/null
+++ b/blockchain_parser/undo.py
@@ -0,0 +1,180 @@
+# Copyright (C) 2015-2020 The bitcoin-blockchain-parser developers
+#
+# This file is part of bitcoin-blockchain-parser.
+#
+# It is subject to the license terms in the LICENSE file found in the top-level
+# directory of this distribution.
+#
+# No part of bitcoin-blockchain-parser, including this file, may be copied,
+# modified, propagated, or distributed except according to the terms contained
+# in the LICENSE file.
+
+from .utils import decode_varint, decode_compactsize, decompress_txout_amt
+
+# Scripts whose size code is below this value are stored in a special form
+NSPECIALSCRIPTS = 6
+# Prime of the finite field the secp256k1 curve is defined over
+SECP256K1_P = 2**256 - 2**32 - 977
+
+
+def get_uncompressed_pk(prefix, x_bytes):
+    """Returns the 65-byte uncompressed form of a secp256k1 public key
+
+    :param prefix: 2 if the y coordinate is even, 3 if it is odd
+    :param x_bytes: the x coordinate, as 32 big-endian bytes
+    :raises ValueError: if x_bytes is not the x coordinate of a curve point
+    """
+    x = int.from_bytes(x_bytes, "big")
+    y_squared = (pow(x, 3, SECP256K1_P) + 7) % SECP256K1_P
+    # p % 4 == 3, so a square root (if any) is y_squared ** ((p + 1) / 4)
+    y = pow(y_squared, (SECP256K1_P + 1) // 4, SECP256K1_P)
+    if x >= SECP256K1_P or y * y % SECP256K1_P != y_squared:
+        raise ValueError("Compressed public key is not on the curve")
+    if y % 2 != prefix % 2:
+        y = SECP256K1_P - y
+    return b"\x04" + x_bytes + y.to_bytes(32, "big")
+
+
+def decompress_script(raw_hex):
+    """Takes a script as compressed by bitcoin core (see src/compressor.cpp)
+    and returns it in uncompressed form
+
+    :param raw_hex: size code (VarInt) followed by the compressed payload,
+        as returned by SpentScriptPubKey.extract_from_hex
+    :type raw_hex: bytes
+    :return: the decompressed script
+    :rtype: bytes
+    :raises ValueError: if the payload has the wrong size or is invalid
+    (this code adapted from https://github.com/sr-gi/bitcoin_tools)
+    """
+    script_type, offset = decode_varint(raw_hex)
+    compressed_script = raw_hex[offset:]
+
+    if script_type >= NSPECIALSCRIPTS:
+        # Not a special script: the payload is the script itself
+        if len(compressed_script) != script_type - NSPECIALSCRIPTS:
+            raise ValueError("Compressed script has wrong size")
+        return bytes(compressed_script)
+
+    expected_size = 20 if script_type < 2 else 32
+    if len(compressed_script) != expected_size:
+        raise ValueError("Compressed script has wrong size")
+
+    if script_type == 0:
+        # P2PKH: OP_DUP OP_HASH160 <hash> OP_EQUALVERIFY OP_CHECKSIG
+        return b"\x76\xa9\x14" + compressed_script + b"\x88\xac"
+    if script_type == 1:
+        # P2SH: OP_HASH160 <hash> OP_EQUAL
+        return b"\xa9\x14" + compressed_script + b"\x87"
+    if script_type in (2, 3):
+        # P2PK, compressed key: <prefix + x> OP_CHECKSIG
+        return b"\x21" + bytes([script_type]) + compressed_script + b"\xac"
+    # P2PK, uncompressed key stored as x only: <04 + x + y> OP_CHECKSIG
+    public_key = get_uncompressed_pk(script_type - 2, compressed_script)
+    return b"\x41" + public_key + b"\xac"
+
+
+class BlockUndo(object):
+    """
+    Represents a block of spent transaction outputs (coins), as encoded
+    in the undo rev*.dat files
+    """
+    def __init__(self, raw_hex):
+        self._raw_hex = raw_hex
+        self.spends = []
+        num_txs, pos = decode_compactsize(raw_hex)
+        for _ in range(num_txs):
+            txn = SpentTransaction(raw_hex=raw_hex[pos:])
+            self.spends.append(txn)
+            # txn.len is the number of bytes this transaction took up
+            pos += txn.len
+
+
+class SpentTransaction(object):
+    """Represents the spent outputs consumed by one Transaction"""
+    def __init__(self, raw_hex=None):
+        self._raw_hex = raw_hex
+        self.outputs = []
+        self.output_len, pos = decode_compactsize(raw_hex)
+        for _ in range(self.output_len):
+            output = SpentOutput(raw_hex=raw_hex[pos:])
+            self.outputs.append(output)
+            pos += output.len
+        # Number of bytes of raw_hex taken up by this transaction
+        self.len = pos
+
+    @classmethod
+    def from_hex(cls, hex_):
+        return cls(hex_)
+
+
+class SpentOutput(object):
+    """Represents a spent Transaction output"""
+
+    def __init__(self, raw_hex=None):
+        self._raw_hex = raw_hex
+
+        # The height code is 2 * height, plus 1 for a coinbase output
+        height_code, pos = decode_varint(raw_hex)
+        self.is_coinbase = height_code % 2 == 1
+        self.height = height_code // 2
+
+        # skip byte reserved only for backwards compatibility,
+        # should always be 0x00
+        pos += 1
+
+        # decode compressed txout amount
+        compressed_amt, compressed_amt_len = decode_varint(raw_hex[pos:])
+        self.amt = decompress_txout_amt(compressed_amt)
+        pos += compressed_amt_len
+
+        # get script
+        script_hex, _ = SpentScriptPubKey.extract_from_hex(raw_hex[pos:])
+        self.script_pub_key_compressed = SpentScriptPubKey(script_hex)
+        # Number of bytes of raw_hex taken up by this output
+        self.len = pos + self.script_pub_key_compressed.len
+
+    @classmethod
+    def from_hex(cls, hex_):
+        return cls(hex_)
+
+    @property
+    def script(self):
+        """The decompressed script of this output, as bytes"""
+        return self.script_pub_key_compressed.script
+
+
+class SpentScriptPubKey(object):
+    """Represents the script portion of a spent Transaction output"""
+    def __init__(self, raw_hex=None):
+        self._raw_hex = raw_hex
+        self._script = None
+        self.len = len(raw_hex)
+
+    @classmethod
+    def from_hex(cls, hex_):
+        return cls(hex_)
+
+    @classmethod
+    def extract_from_hex(cls, raw_hex):
+        """Returns the compressed script found at the start of raw_hex
+        (size code included) and the number of bytes it takes up
+
+        :rtype: tuple(bytes, int)
+        """
+        size_code, size_code_len = decode_varint(raw_hex)
+        if size_code in (0x00, 0x01):
+            script_len = 20
+        elif size_code < NSPECIALSCRIPTS:
+            script_len = 32
+        else:
+            script_len = size_code - NSPECIALSCRIPTS
+        total_len = size_code_len + script_len
+        return (raw_hex[:total_len], total_len)
+
+    @property
+    def script(self):
+        """The decompressed script, as bytes (computed on first access)"""
+        if self._script is None:
+            self._script = decompress_script(self._raw_hex)
+        return self._script
diff --git a/blockchain_parser/utils.py b/blockchain_parser/utils.py
index 18ce8d8..e54dd51 100644
--- a/blockchain_parser/utils.py
+++ b/blockchain_parser/utils.py
@@ -39,7 +39,7 @@ def decode_uint64(data):
     return struct.unpack("<Q", data)[0]
 
 
-def decode_varint(data):
+def decode_compactsize(data):
     assert(len(data) > 0)
     size = int(data[0])
     assert(size <= 255)
@@ -59,3 +59,82 @@ def decode_varint(data):
 
     size = struct.calcsize(format_)
     return struct.unpack(format_, data[1:size+1])[0], size + 1
+
+
+def decode_varint(raw_hex):
+    """
+    Reads the weird format of VarInt present in src/serialize.h of bitcoin core
+    and being used for storing data in the leveldb.
+    This is not the VARINT format described for general bitcoin serialization
+    use.
+
+    Returns a (value, number of bytes read) tuple. Raises IndexError if
+    raw_hex ends before the last byte of the VarInt.
+    """
+    n = 0
+    for pos, data in enumerate(raw_hex, 1):
+        n = (n << 7) | (data & 0x7f)
+        if data & 0x80 == 0:
+            return n, pos
+        # The format adds one for every continuation byte
+        n += 1
+    raise IndexError("VarInt is truncated")
+
+
+def decompress_txout_amt(amount_compressed_int):
+    """ Decompresses the Satoshi amount of a UTXO stored in the LevelDB.
+    Inverse of compress_txout_amt.
+    :param amount_compressed_int: The compressed amount of Satoshis.
+    :type amount_compressed_int: int
+    :return: The amount of Satoshis.
+    :rtype: int
+    (this function stolen from https://github.com/sr-gi/bitcoin_tools and
+    modified to remove bug)
+    """
+    # No need to do any work if it's zero.
+    if amount_compressed_int == 0:
+        return 0
+
+    # The compressed amount is 1 + 10*(9*n + d - 1) + e when e < 9
+    # and 1 + 10*(n - 1) + 9 when e = 9, so peel the layers off in turn
+    exponent = (amount_compressed_int - 1) % 10
+    remainder = (amount_compressed_int - 1) // 10
+
+    if exponent < 9:
+        # remainder is 9*n + d - 1, d being the last non-zero digit
+        last_digit = remainder % 9 + 1
+        n = (remainder // 9) * 10 + last_digit
+    else:
+        # remainder is n - 1
+        n = remainder + 1
+
+    # Apply the exponent.
+    return n * 10**exponent
+
+
+def compress_txout_amt(n):
+    """ Compresses the Satoshi amount of a UTXO to be stored in the LevelDB.
+    Code is a port from the Bitcoin Core C++ source:
+    https://github.com/bitcoin/bitcoin/blob/v0.13.2/src/compressor.cpp#L133#L160
+    :param n: Satoshi amount to be compressed.
+    :type n: int
+    :return: The compressed amount of Satoshis.
+    :rtype: int
+    (this function stolen from https://github.com/sr-gi/bitcoin_tools and
+    modified to remove bug)
+    """
+
+    if n == 0:
+        return 0
+    e = 0
+    while ((n % 10) == 0) and e < 9:
+        n //= 10
+        e += 1
+
+    if e < 9:
+        d = (n % 10)
+        assert (1 <= d <= 9)
+        n //= 10
+        return 1 + (n * 9 + d - 1) * 10 + e
+    else:
+        return 1 + (n - 1) * 10 + 9
diff --git a/examples/ordered-blocks.py b/examples/ordered-blocks.py
index 061e804..426c9d4 100644
--- a/examples/ordered-blocks.py
+++ b/examples/ordered-blocks.py
@@ -10,4 +10,4 @@
 # `index` directory (LevelDB index) being maintained by bitcoind. It contains
 # .ldb files and is present inside the `blocks` directory
 for block in blockchain.get_ordered_blocks(sys.argv[1] + '/index', end=1000):
-    print("height=%d block=%s" % (block.height, block.hash))
\ No newline at end of file
+    print("height=%d block=%s" % (block.height, block.hash))
diff --git a/examples/undo-blocks.py b/examples/undo-blocks.py
new file mode 100644
index 0000000..34f7f88
--- /dev/null
+++ b/examples/undo-blocks.py
@@ -0,0 +1,15 @@
+import os
+
+from blockchain_parser.blockchain import get_blocks, get_undo_files
+from blockchain_parser.undo import BlockUndo
+
+undo_files = get_undo_files(os.path.expanduser('~/.bitcoin/blocks'))
+undo_block_ctr = 0
+for i, file_name in enumerate(undo_files):
+    print("parsing undo file #%d" % i)
+    for j, block_raw in enumerate(get_blocks(file_name)):
+        undo_block_ctr += 1
+        if j % 1000 == 0:
+            print("parsing undo block #%d in file #%d block #%d"
+                  % (undo_block_ctr, i, j))
+        block_undo_current = BlockUndo(block_raw)