Commit dd959b28 authored by Kirill Smelkov's avatar Kirill Smelkov

zodbdump += DumpReader - to read/parse zodbdump stream

We will likely need this reader for `zodb restore` in the future.
We will also use this reader for `zodb commit` in the next patch.

pygolang dependency v↑ because we use recently introduced
golang.strconv to unquote user/desc/extension strings.

Python2 works. Python3 support is only minimal and incomplete.
parent e973d519
......@@ -19,7 +19,7 @@ setup(
keywords = 'zodb utility tool',
packages = find_packages(),
install_requires = ['ZODB', 'zodburi', 'pygolang >= 0.0.0.dev3', 'six'],
install_requires = ['ZODB', 'zodburi', 'zope.interface', 'pygolang >= 0.0.0.dev6', 'six'],
extras_require = {
'test': ['pytest'],
......
# Copyright (C) 2017 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
# Copyright (C) 2017-2018 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
......@@ -17,12 +17,18 @@
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
from zodbtools.zodbdump import zodbdump
from zodbtools.zodbdump import (
zodbdump, DumpReader, Transaction, ObjectDelete, ObjectCopy,
ObjectData, HashOnly
)
from ZODB.FileStorage import FileStorage
from ZODB.utils import p64
from cStringIO import StringIO
from os.path import dirname
from pytest import raises
# verify zodbdump output against golden
def test_zodbdump():
tdir = dirname(__file__)
......@@ -35,3 +41,102 @@ def test_zodbdump():
zodbdump(stor, None, None, out=out)
assert out.getvalue() == dumpok
# verify zodbdump.DumpReader
#
# The test feeds a two-transaction dump to DumpReader, checks every parsed
# field, checks that dumping parsed transactions back reproduces the input
# byte-for-byte, and finally checks error reporting for an unknown hash
# function and for corrupt object data.
def test_dumpreader():
    # unhexlify works on both Python2 and Python3, contrary to
    # str.decode('hex') which exists only on Python2.
    from binascii import unhexlify

    # NOTE blank lines inside the dump are significant:
    # - the payload of object 5 is 9 bytes: 'ABC\n\nDEF!';
    # - every transaction is terminated by an empty line.
    in_ = b"""\
txn 0123456789abcdef " "
user "my name"
description "o la-la..."
extension "zzz123 def"
obj 0000000000000001 delete
obj 0000000000000002 from 0123456789abcdee
obj 0000000000000003 54 adler32:01234567 -
obj 0000000000000004 4 sha1:9865d483bc5a94f2e30056fc256ed3066af54d04
ZZZZ
obj 0000000000000005 9 crc32:52fdeac5
ABC

DEF!

txn 0123456789abcdf0 " "
user "author2"
description "zzz"
extension "qqq"

"""

    r = DumpReader(StringIO(in_))

    # first transaction: metadata + 5 object records of every kind
    t1 = r.readtxn()
    assert isinstance(t1, Transaction)
    assert t1.tid == unhexlify('0123456789abcdef')
    assert t1.user == b'my name'
    assert t1.description == b'o la-la...'
    assert t1.extension_bytes == b'zzz123 def'
    assert len(t1.objv) == 5

    _ = t1.objv[0]
    assert isinstance(_, ObjectDelete)
    assert _.oid == p64(1)

    _ = t1.objv[1]
    assert isinstance(_, ObjectCopy)
    assert _.oid == p64(2)
    assert _.copy_from == unhexlify('0123456789abcdee')

    _ = t1.objv[2]
    assert isinstance(_, ObjectData)
    assert _.oid == p64(3)
    assert _.data == HashOnly(54)
    assert _.hashfunc == 'adler32'
    assert _.hash_ == unhexlify('01234567')

    _ = t1.objv[3]
    assert isinstance(_, ObjectData)
    assert _.oid == p64(4)
    assert _.data == b'ZZZZ'
    assert _.hashfunc == 'sha1'
    assert _.hash_ == unhexlify('9865d483bc5a94f2e30056fc256ed3066af54d04')

    _ = t1.objv[4]
    assert isinstance(_, ObjectData)
    assert _.oid == p64(5)
    assert _.data == b'ABC\n\nDEF!'
    assert _.hashfunc == 'crc32'
    assert _.hash_ == unhexlify('52fdeac5')

    # second transaction: metadata only, no objects
    t2 = r.readtxn()
    assert isinstance(t2, Transaction)
    assert t2.tid == unhexlify('0123456789abcdf0')
    assert t2.user == b'author2'
    assert t2.description == b'zzz'
    assert t2.extension_bytes == b'qqq'
    assert t2.objv == []

    # EOF
    assert r.readtxn() is None

    # dumping parsed transactions must give the original input back
    z = ''.join([_.zdump() for _ in (t1, t2)])
    assert z == in_

    # unknown hash function
    r = DumpReader(StringIO("""\
txn 0000000000000000 " "
user ""
description ""
extension ""
obj 0000000000000001 1 xyz:0123 -
"""))
    with raises(RuntimeError) as exc:
        r.readtxn()
    assert exc.value.args == ("""+5: invalid line: unknown hash function "xyz" ('obj 0000000000000001 1 xyz:0123 -')""",)

    # data integrity error
    r = DumpReader(StringIO("""\
txn 0000000000000000 " "
user ""
description ""
extension ""
obj 0000000000000001 5 crc32:01234567
hello
"""))
    with raises(RuntimeError) as exc:
        r.readtxn()
    assert exc.value.args == ("""+6: data corrupt: crc32 = 3610a686, expected 01234567""",)
......@@ -18,7 +18,7 @@
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
import hashlib, struct
import hashlib, struct, codecs
import zodburi
from six.moves.urllib_parse import urlsplit, urlunsplit
from zlib import crc32, adler32
......@@ -26,6 +26,9 @@ from zlib import crc32, adler32
# ashex returns hexadecimal representation of binary string s.
#
# It is the counterpart of fromhex.
#
# codecs.encode is used instead of s.encode('hex'), because the latter is
# available only on Python2, while the former works on both Python2 and
# Python3. On Python3 the result is bytes.
def ashex(s):
    return codecs.encode(s, 'hex')
# fromhex decodes hexadecimal string s into the raw binary string it
# represents, using the 'hex' codec.
def fromhex(s):
    raw = codecs.decode(s, 'hex')
    return raw
def sha1(data):
m = hashlib.sha1()
m.update(data)
......
# Copyright (C) 2016-2017 Nexedi SA and Contributors.
# Copyright (C) 2016-2018 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
......@@ -53,15 +53,19 @@ TODO also protect txn record by hash.
"""
from __future__ import print_function
from zodbtools.util import ashex, sha1, txnobjv, parse_tidrange, TidRangeInvalid, \
storageFromURL
from zodbtools.util import ashex, fromhex, sha1, txnobjv, parse_tidrange, TidRangeInvalid, \
storageFromURL, hashRegistry
from ZODB._compat import loads, _protocol, BytesIO
from zodbpickle.slowpickle import Pickler as pyPickler
#import pickletools
from ZODB.interfaces import IStorageTransactionMetaData
from zope.interface import implementer
import sys
import logging
import re
from golang.gcompat import qq
from golang import strconv
# txn_raw_extension returns raw extension from txn metadata
def txn_raw_extension(stor, txn):
......@@ -271,3 +275,244 @@ def main(argv):
stor = storageFromURL(storurl, read_only=True)
zodbdump(stor, tidmin, tidmax, hashonly)
# ----------------------------------------
# dump reading/parsing
# _txn_re matches transaction header line:
#
#   txn <tid> "<status>"
#
# Patterns are raw literals so that `\w` reaches the regex engine as-is
# instead of being an invalid string escape.
_txn_re = re.compile(br'^txn (?P<tid>[0-9a-f]{16}) "(?P<status>.)"$')

# _obj_re matches object header line, which is one of:
#
#   obj <oid> delete
#   obj <oid> from <tid>
#   obj <oid> <size> <hashfunc>:<hash>
#   obj <oid> <size> <hashfunc>:<hash> -
#
# The pattern is anchored at the end, the same way as _txn_re, so that
# trailing garbage after a valid prefix is rejected instead of being
# silently ignored.
_obj_re = re.compile(
    br'^obj (?P<oid>[0-9a-f]{16}) '
    br'((?P<delete>delete)'
    br'|from (?P<from>[0-9a-f]{16})'
    br'|(?P<size>[0-9]+) (?P<hashfunc>\w+):(?P<hash>[0-9a-f]+)(?P<hashonly> -)?'
    br')$')
# _ioname gives the name associated with reader r.
#
# Readers that have no .name attribute yield ''.
def _ioname(r):
    try:
        return r.name
    except AttributeError:
        return ''
# DumpReader wraps IO reader to read transactions from zodbdump stream.
#
# The reader must provide .readline() and .read() methods.
# The reader must be opened in binary mode.
class DumpReader(object):
    # .lineno   - line number position in read stream

    def __init__(self, r):
        self._r = r
        self._line = None   # last read line
        self.lineno = 0

    # _readline reads next line from the stream and returns it without
    # trailing LF. None is returned at EOF.
    def _readline(self):
        l = self._r.readline()
        if not l:
            # .readline() gives empty result only at EOF. Testing for
            # emptiness, instead of comparing with '', works for both
            # bytes and str.
            self._line = None
            return None
        l = l.rstrip(b'\n')
        self.lineno += 1
        self._line = l
        return l

    # _readfull reads exactly n bytes from the stream and returns them.
    #
    # RuntimeError is raised if the stream ends before n bytes were read.
    def _readfull(self, n):
        chunkv = []
        while n > 0:
            chunk = self._r.read(n)
            if not chunk:
                # EOF. Without this check the loop would never terminate on
                # a truncated stream, because n stops decreasing.
                self.lineno += b''.join(chunkv).count(b'\n')
                self._line = None
                raise RuntimeError('%s+%d: unexpected EOF: %d more bytes of obj data expected' % (
                        _ioname(self._r), self.lineno, n))
            chunkv.append(chunk)
            n -= len(chunk)
        return b''.join(chunkv)

    # report a problem found around currently-read line
    def _badline(self, msg):
        raise RuntimeError("%s+%d: invalid line: %s (%r)" % (_ioname(self._r), self.lineno, msg, self._line))

    # readtxn reads one transaction record from input stream and returns
    # Transaction instance or None at EOF.
    #
    # RuntimeError is raised if the stream is malformed, truncated, refers to
    # unknown hash function, or if object data does not match its hash.
    def readtxn(self):
        # header
        l = self._readline()
        if l is None:
            return None
        m = _txn_re.match(l)
        if m is None:
            self._badline('no txn start')
        tid = fromhex(m.group('tid'))
        status = m.group('status')

        # get reads `<name> <quoted-value>` line and returns unquoted value.
        def get(name):
            prefix = name.encode('ascii') + b' '
            l = self._readline()
            if l is None or not l.startswith(prefix):
                self._badline('no %s' % name)
            return strconv.unquote(l[len(prefix):])

        user = get('user')
        description = get('description')
        extension = get('extension')

        # objects
        objv = []
        while 1:
            l = self._readline()
            if l == b'':
                break   # empty line - end of transaction
            if l is None or not l.startswith(b'obj '):
                self._badline('no obj')
            m = _obj_re.match(l)
            if m is None:
                self._badline('invalid obj entry')

            obj = None  # will be Object*
            oid = fromhex(m.group('oid'))
            from_ = m.group('from')
            if m.group('delete'):
                obj = ObjectDelete(oid)
            elif from_:
                copy_from = fromhex(from_)
                obj = ObjectCopy(oid, copy_from)
            else:
                size = int(m.group('size'))
                hashfunc = m.group('hashfunc')
                hashok = fromhex(m.group('hash'))
                hashonly = m.group('hashonly') is not None
                data = None  # see vvv

                hcls = hashRegistry.get(hashfunc)
                if hcls is None:
                    self._badline('unknown hash function %s' % qq(hashfunc))

                if hashonly:
                    # only hash is present in the stream - nothing to verify
                    data = HashOnly(size)
                else:
                    data = self._readfull(size + 1)     # data LF
                    self.lineno += data.count(b'\n')
                    self._line = None
                    if data[-1:] != b'\n':
                        raise RuntimeError('%s+%d: no LF after obj data' % (_ioname(self._r), self.lineno))
                    data = data[:-1]

                    # verify data integrity
                    # TODO option to allow reading corrupted data
                    h = hcls()
                    h.update(data)
                    hash_ = h.digest()
                    if hash_ != hashok:
                        raise RuntimeError('%s+%d: data corrupt: %s = %s, expected %s' % (
                            _ioname(self._r), self.lineno, h.name, ashex(hash_), ashex(hashok)))

                obj = ObjectData(oid, data, hashfunc, hashok)

            objv.append(obj)

        return Transaction(tid, status, user, description, extension, objv)
# Transaction is in-memory form of one transaction record of zodbdump stream.
#
# It is declared to implement IStorageTransactionMetaData.
@implementer(IStorageTransactionMetaData)
class Transaction(object):
    # .tid              p64         transaction ID
    # .status           char        status of the transaction
    # .user             bytes       transaction author
    # .description      bytes       transaction description
    # .extension_bytes  bytes       transaction extension
    # .objv             []Object*   objects changed by transaction

    def __init__(self, tid, status, user, description, extension, objv):
        self.tid = tid
        self.status = status
        self.user = user
        self.description = description
        self.extension_bytes = extension
        self.objv = objv

    # .extension gives the extension as dict, because that is the form ZODB
    # wants to work with. It is computed from .extension_bytes on every access.
    #
    # Empty .extension_bytes gives {}. For arbitrary .extension_bytes input
    # the conversion can fail.
    #
    # The conversion should become unneeded once either of
    #
    #   https://github.com/zopefoundation/ZODB/pull/183, or
    #   https://github.com/zopefoundation/ZODB/pull/207
    #
    # is in ZODB.
    #
    # NOTE(review): loads comes from ZODB._compat and presumably unpickles its
    # argument - confirm before using it on dumps from untrusted sources.
    @property
    def extension(self):
        raw = self.extension_bytes
        return loads(raw) if raw else {}

    # zdump gives the record back as text in zodbdump format: txn header,
    # user/description/extension lines, all object records, and the empty
    # line that terminates the transaction.
    def zdump(self):
        partv = [
            'txn %s %s\n' % (ashex(self.tid), qq(self.status)),
            'user %s\n' % qq(self.user),
            'description %s\n' % qq(self.description),
            'extension %s\n' % qq(self.extension_bytes),
        ]
        partv.extend([obj.zdump() for obj in self.objv])
        partv.append('\n')
        return ''.join(partv)
# Object is the common base of all object records (ObjectDelete, ObjectCopy,
# ObjectData) found in zodbdump stream. It only remembers the object ID.
class Object(object):
    # .oid      p64         object ID

    def __init__(self, oid):
        self.oid = oid
# ObjectDelete is the record saying that object was deleted.
class ObjectDelete(Object):

    def __init__(self, oid):
        super(ObjectDelete, self).__init__(oid)

    # zdump gives the record as `obj <oid> delete` line.
    def zdump(self):
        oidhex = ashex(self.oid)
        return 'obj %s delete\n' % (oidhex)
# ObjectCopy is the record saying that object data is a copy of the data the
# object had in an earlier revision.
class ObjectCopy(Object):
    # .copy_from    tid     copy object data from object's revision tid

    def __init__(self, oid, copy_from):
        super(ObjectCopy, self).__init__(oid)
        self.copy_from = copy_from

    # zdump gives the record as `obj <oid> from <tid>` line.
    def zdump(self):
        oidhex = ashex(self.oid)
        fromhex_ = ashex(self.copy_from)
        return 'obj %s from %s\n' % (oidhex, fromhex_)
# ObjectData is the record that carries object data, or only the hash of it.
class ObjectData(Object):
    # .data     HashOnly | bytes
    # .hashfunc str             hash function used for integrity
    # .hash_    bytes           hash of the object's data

    def __init__(self, oid, data, hashfunc, hash_):
        super(ObjectData, self).__init__(oid)
        self.data = data
        self.hashfunc = hashfunc
        self.hash_ = hash_

    # zdump gives the record in zodbdump format:
    #
    #   obj <oid> <size> <hashfunc>:<hash> -            ; .data is HashOnly
    #   obj <oid> <size> <hashfunc>:<hash> LF data LF   ; .data is bytes
    def zdump(self):
        payload = self.data
        only_hash = isinstance(payload, HashOnly)
        # HashOnly carries the size explicitly; for real data it is its length
        nbytes = payload.size if only_hash else len(payload)

        header = 'obj %s %d %s:%s' % (ashex(self.oid), nbytes, self.hashfunc, ashex(self.hash_))
        if only_hash:
            return header + ' -' + '\n'
        return header + '\n' + payload + '\n'
# HashOnly indicates that this ObjectData record contains only hash and does
# not contain object data.
#
# Two HashOnly instances are equal when their sizes are equal. A HashOnly is
# never equal to an object of another type.
class HashOnly(object):
    # .size     int

    def __init__(self, size):
        self.size = size

    def __repr__(self):
        return 'HashOnly(%d)' % self.size

    def __eq__(a, b):
        return isinstance(b, HashOnly) and a.size == b.size

    # Python2 does not derive != from ==: without explicit __ne__ two equal
    # instances would also compare as "not equal".
    def __ne__(a, b):
        return not (a == b)

    # keep hash consistent with __eq__: equal instances hash equally.
    def __hash__(self):
        return hash((HashOnly, self.size))
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment