Commit 2801fae9 authored by Kirill Smelkov's avatar Kirill Smelkov

zodbdump: Start to stabilize format + add test

We start to stabilize the output format of `zodb dump`. It is now actually robust, and the only thing I would still contemplate changing is to also cover transaction metadata by a hash checksum. So please take a look at the updated format (details in patch 1) and provide feedback, because it is likely close to what it will be in its final form.

We also add a program to generate a test database which uses various fancy ZODB features, and check `zodb dump` output on it against the golden one (patch 3).

To be able to dump transaction metadata in raw form ZODB is patched a bit:

https://github.com/zopefoundation/ZODB/pull/183

and we try to detect whether appropriate support is there at runtime and if yes use it to streamline obtaining transaction extension as raw (patch 2).
Please see patch 1 (the second half of `zodbdump.py`) for what has to be done without such support, and for why it still cannot work fully reliably.

/cc @nexedi
/reviewed-on nexedi/zodbtools!3
parents 79cf177a 7f0bbf7e
...@@ -21,6 +21,10 @@ setup( ...@@ -21,6 +21,10 @@ setup(
packages = find_packages(), packages = find_packages(),
install_requires = ['ZODB', 'zodburi', 'six'], install_requires = ['ZODB', 'zodburi', 'six'],
extras_require = {
'test': ['pytest'],
},
entry_points= {'console_scripts': ['zodb = zodbtools.zodb:main']}, entry_points= {'console_scripts': ['zodb = zodbtools.zodb:main']},
classifiers = [_.strip() for _ in """\ classifiers = [_.strip() for _ in """\
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2017 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""generate reference database and index for tests"""
# NOTE result of this script must be saved in version control and should not be
# generated at the time when tests are run. This is because even though we make
# time and random predictable ZODB cannot generally save same transaction
# extension dictionary to the same raw data.
#
# Quoting
#
# https://docs.python.org/2.7/library/stdtypes.html#dict.items and
# https://docs.python.org/3.7/library/stdtypes.html#dictionary-view-objects
#
# """ CPython implementation detail: Keys and values are listed in an arbitrary
# order which is non-random, varies across Python implementations, and depends
# on the dictionary’s history of insertions and deletions. """
# NOTE as of 14 Mar 2017 FileStorage cannot commit transactions with non-ASCII
# metadata - so it is not tested
from ZODB.FileStorage import FileStorage
from ZODB import DB
from ZODB.POSException import UndoError
from persistent import Persistent
import transaction
import sys
import struct
import time
import random
import logging
# convert numeric oid to/from str
def p64(num):
    """Pack integer `num` into its 8-byte big-endian unsigned representation."""
    packed = struct.pack('>Q', num)
    return packed
def unpack64(packed):
    """Decode 8-byte big-endian `packed` back into an unsigned integer."""
    (num,) = struct.unpack('>Q', packed)
    return num
def hex64(packed):
    """Format 8-byte big-endian `packed` as '0x' followed by 16 hex digits."""
    (num,) = struct.unpack('>Q', packed)
    return '0x%016x' % num
# make time.time() predictable: the fake clock starts at 04 Jan 1979 (as
# interpreted by time.mktime) and moves forward by 1.1 seconds on every call.
_xtime_start = time.strptime("04 Jan 1979", "%d %b %Y")
_xtime = time.mktime(_xtime_start)

def xtime():
    """Advance the fake clock by 1.1 seconds and return its new reading."""
    global _xtime
    _xtime = _xtime + 1.1
    return _xtime

# from here on every caller of time.time() sees the fake clock
time.time = xtime
# precommit prepares current transaction for a commit: it stores user,
# description and extension on the transaction and returns the transaction
# without committing it.
def precommit(user, description, extension):
    txn = transaction.get()
    metadata = (
        ("user",        user),
        ("description", description),
        ("extension",   extension),
    )
    # assign in the same order the attributes are listed above
    for attr, value in metadata:
        setattr(txn, attr, value)
    return txn
def commit(user, description, extension):
    """Fill in metadata of the current transaction via precommit and commit it."""
    precommit(user, description, extension).commit()
class Object(Persistent):
    """Persistent object whose whole state is the single attribute .value"""

    def __init__(self, value):
        self.value = value

    def __getstate__(self):
        # the state handed out is the bare value itself, not an attribute dict
        state = self.value
        return state

    def __setstate__(self, state):
        # counterpart of __getstate__: state is the bare value
        self.value = state
# ext prepares extension dictionary for subject.
#
# The result always has "x-generator" key, which records major python version
# and the subject, plus one "x-cookie<C>" key with random 5-character value,
# where <C> and the value characters are randomly picked from alnum.
#
# Keys are inserted into the result in random order. The order in which
# `random` is consulted (5 x cookie char, 1 x key suffix, shuffle) is part of
# what makes generated data reproducible after random.seed() - keep it.
alnum = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
def ext(subj):
    d = {"x-generator": "zodb/py%s (%s)" % (sys.version_info.major, subj)}

    # also add some random 'x-cookie'
    cooklen = 5
    cookie = "".join(random.choice(alnum) for _ in range(cooklen))
    xcookie = "x-cookie" + random.choice(alnum)
    d[xcookie] = cookie

    # shuffle extension dict randomly - to likely trigger different ordering on save
    # NOTE list(...) is required: on Python 3 dict.keys() is a view which
    # random.shuffle cannot shuffle in place. On Python 2 it is just a copy.
    keyv = list(d.keys())
    random.shuffle(keyv)
    shuffled = {}
    for key in keyv:
        shuffled[key] = d[key]
    return shuffled
# gen_testdb generates test FileStorage database @ outfs_path
#
# The database is built in Niter passes; on every pass the storage is
# (re)opened - created on the first pass - and the following is committed:
#
#   - 25 transactions, each changing a randomly picked object hooked to the
#     top-level root by a/b/c/... key;
#   - up to 2 undo transactions;
#   - a transaction replacing a randomly picked object under its key, followed
#     by a transaction deleting the replaced object directly at storage level.
#
# random is seeded with 0, so the sequence of random choices - and thus the
# order of statements consulting random - determines the generated content.
def gen_testdb(outfs_path):
    logging.basicConfig()

    # generate random changes to objects hooked to top-level root by a/b/c/... key
    random.seed(0)

    namev = list("abcdefg")
    Niter = 2
    for i in range(Niter):
        stor = FileStorage(outfs_path, create=(i == 0))
        db = DB(stor)
        conn = db.open()
        root = conn.root()
        assert root._p_oid == p64(0), repr(root._p_oid)

        for j in range(25):
            name = random.choice(namev)
            if name in root:
                obj = root[name]
            else:
                root[name] = obj = Object(None)

            obj.value = "%s%i.%i" % (name, i, j)

            commit(u"user%i.%i" % (i,j), u"step %i.%i" % (i, j), ext(name))

        # undo a transaction one step before a latest one a couple of times
        for j in range(2):
            # XXX undoLog, despite what its interface says:
            #   https://github.com/zopefoundation/ZODB/blob/2490ae09/src/ZODB/interfaces.py#L472
            # just returns log of all transactions in specified range:
            #   https://github.com/zopefoundation/ZODB/blob/2490ae09/src/ZODB/FileStorage/FileStorage.py#L1008
            #   https://github.com/zopefoundation/ZODB/blob/2490ae09/src/ZODB/FileStorage/FileStorage.py#L2103
            # so we retry undoing next log's txn on conflict.
            for ul in db.undoLog(1, 20):
                try:
                    db.undo(ul["id"])
                    commit(u"root%i.%i\nYour\nMagesty " % (i, j),
                           u"undo %i.%i\nmore detailed description\n\nzzz ..." % (i, j) + "\t"*(i+j),
                           ext("undo %s" % ul["id"]))
                except UndoError:
                    transaction.abort()
                    continue

                # first successful undo is enough for this step
                break

        # delete an object
        # NOTE list(...) is required: on Python 3 .keys() is a view which
        # random.choice cannot index. On Python 2 it is just a copy.
        name = random.choice(list(root.keys()))
        obj = root[name]
        root[name] = Object("%s%i*" % (name, i))
        # NOTE user/ext are kept empty on purpose - to also test this case
        commit(u"", u"predelete %s" % unpack64(obj._p_oid), {})

        # XXX obj in db could be changed by above undo, but ZODB does not automatically
        # propagate undo changes to live objects - so obj._p_serial can be stale.
        # Get serial via history.
        obj_tid_lastchange = db.history(obj._p_oid)[0]['tid']

        txn = precommit(u"root%i\nYour\nRoyal\nMagesty' " % i +
                            ''.join(chr(_) for _ in range(32)),     # <- NOTE all control characters
                        u"delete %i\nalpha beta gamma'delta\"lambda\n\nqqq ..." % i,
                        ext("delete %s" % unpack64(obj._p_oid)))
        # the deletion goes through the storage two-phase commit API directly
        stor.tpc_begin(txn)
        stor.deleteObject(obj._p_oid, obj_tid_lastchange, txn)
        stor.tpc_vote(txn)
        # TODO different txn status vvv
        # XXX vvv it does the thing, but py fs iterator treats this txn as EOF
        #if i != Niter-1:
        #    stor.tpc_finish(txn)
        stor.tpc_finish(txn)

        # close db & rest not to get conflict errors after we touched stor
        # directly a bit. everything will be reopened on next iteration.
        conn.close()
        db.close()
        stor.close()
# ----------------------------------------
from zodbtools.zodbdump import zodbdump
def main():
    """Generate testdata/1.fs and the golden dump testdata/1.zdump.ok for it.

    Paths are relative to the current working directory.
    """
    out = "testdata/1"
    gen_testdb("%s.fs" % out)
    stor = FileStorage("%s.fs" % out, read_only=True)
    try:
        with open("%s.zdump.ok" % out, "w") as f:
            zodbdump(stor, None, None, out=f)
    finally:
        # always release the storage, even if dumping fails
        stor.close()
if __name__ == '__main__':
main()
# Copyright (C) 2017 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
from zodbtools.zodbdump import zodbdump
from ZODB.FileStorage import FileStorage
from cStringIO import StringIO
from os.path import dirname
# verify zodbdump output against golden
#
# Both the database (testdata/1.fs) and the expected dump (testdata/1.zdump.ok)
# are looked up next to this file.
def test_zodbdump():
    tdir = dirname(__file__)
    stor = FileStorage('%s/testdata/1.fs' % tdir, read_only=True)
    try:
        with open('%s/testdata/1.zdump.ok' % tdir) as f:
            dumpok = f.read()

        out = StringIO()
        zodbdump(stor, None, None, out=out)
    finally:
        # do not leave the storage open after the test, pass or fail
        stor.close()

    assert out.getvalue() == dumpok
# -*- coding: utf-8 -*-
# Copyright (C) 2017 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
from zodbtools.util import escapeqq
def test_escapeqq():
    """Check escapeqq on a table of (input, expected) pairs."""
    # expected values are written without the leading/trailing " -
    # it is added uniformly when comparing.
    cases = [
        # in                want without leading/trailing "
        ('',                r""),
        ('\'',              r"'"),
        ('"',               r"\""),
        ('abc\ndef',        r"abc\ndef"),
        ('a\'c\ndef',       r"a'c\ndef"),
        ('a\"c\ndef',       r"a\"c\ndef"),
        # ('привет',        r"привет"),     TODO
    ]

    for tin, body in cases:
        twant = '"%s"' % body   # add lead/trail "
        assert escapeqq(tin) == twant
...@@ -36,6 +36,23 @@ class Inf: ...@@ -36,6 +36,23 @@ class Inf:
return +1 return +1
inf = Inf() inf = Inf()
# escapeqq escapes string into valid "..." string always quoted with ".
#
# (python's automatic escape uses smartquotes quoting with either ' or ")
#
# The input is escaped byte-wise: control characters, backslash and bytes
# >= 0x80 are \-escaped, " is escaped as \" and ' is left as is.
#
# On Python 2 s is str. On Python 3 s can be either bytes, or str - which is
# then taken as its UTF-8 encoding - and the result is str.
#
# TODO also accept unicode as input on Python 2.
# TODO output printable UTF-8 characters as-is, but escape non-printable UTF-8 and invalid UTF-8 bytes.
def escapeqq(s):
    if str is bytes:
        # Python 2
        # NOTE string_escape does not do smartquotes and always uses ' for quoting
        # (repr(str) is the same except it does smartquoting picking ' or " automatically)
        def escape(chunk):
            return chunk.encode("string_escape")
    else:
        # Python 3: there is no string_escape codec. Map every byte 1:1 to a
        # code point via latin-1 and escape with unicode_escape, which for
        # code points < 0x100 emits \t \n \r \\ and \xNN escapes.
        if isinstance(s, str):
            s = s.encode("utf-8")
        s = s.decode("latin-1")

        def escape(chunk):
            return chunk.encode("unicode_escape").decode("ascii")

    outv = []
    # we don't want ' to be escaped
    for part in s.split("'"):
        # this escape almost everything except " character
        q = escape(part)
        q = q.replace('"', r'\"')
        outv.append(q)

    return '"' + "'".join(outv) + '"'
# get next item from iter -> (item, !stop) # get next item from iter -> (item, !stop)
def nextitem(it): def nextitem(it):
......
...@@ -18,55 +18,209 @@ ...@@ -18,55 +18,209 @@
# See https://www.nexedi.com/licensing for rationale and options. # See https://www.nexedi.com/licensing for rationale and options.
"""Zodbdump - Tool to dump content of a ZODB database """Zodbdump - Tool to dump content of a ZODB database
TODO format (WARNING dump format is not yet stable) This program dumps content of a ZODB database.
It uses ZODB Storage iteration API to get list of transactions and for every
transaction prints transaction's header and information about changed objects.
txn <tid> (<status>) The information dumped is complete raw information as stored in ZODB storage
user <user|encode?> and should be suitable for restoring the database from the dump file bit-to-bit
description <description|encode?> identical to its original(*). It is dumped in semi text-binary format where
extension <extension|encode?> object data is output as raw binary and everything else is text.
obj <oid> (delete | from <tid> | <sha1> <size> (LF <content>)?) LF XXX do we really need back <tid>
---- // ----
LF
txn ...
There is also shortened mode activated via --hashonly where only hash of object
data is printed without content.
Dump format:
txn <tid> <status|quote>
user <user|quote>
description <description|quote>
extension <raw_extension|quote>
obj <oid> (delete | from <tid> | <size> <hashfunc>:<hash> (-|LF <raw-content>)) LF
obj ...
...
obj ...
LF
txn ...
quote: quote string with " with non-printable and control characters \-escaped
hashfunc: one of sha1, sha256, sha512 ...
(*) It is possible to obtain transaction metadata in raw form only in recent ZODB.
See https://github.com/zopefoundation/ZODB/pull/183 for details.
TODO also protect txn record by hash.
""" """
from __future__ import print_function from __future__ import print_function
from zodbtools.util import ashex, sha1, txnobjv, parse_tidrange, TidRangeInvalid, \ from zodbtools.util import ashex, sha1, txnobjv, parse_tidrange, TidRangeInvalid, \
storageFromURL storageFromURL, escapeqq
from ZODB._compat import loads, _protocol, BytesIO
from zodbpickle.slowpickle import Pickler as pyPickler
#import pickletools
import sys
import logging
# txn_raw_extension returns raw extension from txn metadata
#
# If txn has .extension_bytes it is returned as is. Otherwise the raw form is
# regenerated from txn.extension via serializeext, and a warning is logged -
# once per (storage type, storage name) - that the result is only best-effort.
def txn_raw_extension(stor, txn):
    # if txn provides ZODB.interfaces.IStorageTransactionInformationRaw - use it directly
    raw_extension = getattr(txn, "extension_bytes", None)
    if raw_extension is not None:
        return raw_extension

    # otherwise do best effort to generate raw_extension from txn.extension
    # in a rational way
    stor_name = "(%s, %s)" % (type(stor).__name__, stor.getName())
    if stor_name not in _already_warned_notxnraw:
        # NOTE logging.warning, not its deprecated alias logging.warn
        logging.warning("%s: storage does not provide IStorageTransactionInformationRaw ...", stor_name)
        logging.warning("... will do best-effort to dump pickles in stable order but this cannot be done 100% correctly")
        logging.warning("... please upgrade your ZODB & storage: see https://github.com/zopefoundation/ZODB/pull/183 for details.")
        _already_warned_notxnraw.add(stor_name)

    return serializeext(txn.extension)
def zodbdump(stor, tidmin, tidmax, hashonly=False): # set of storage names already warned for not providing IStorageTransactionInformationRaw
_already_warned_notxnraw = set()
# zodbdump dumps content of a ZODB storage to a file.
# please see module doc-string for dump format and details
def zodbdump(stor, tidmin, tidmax, hashonly=False, out=sys.stdout):
first = True first = True
for txn in stor.iterator(tidmin, tidmax): for txn in stor.iterator(tidmin, tidmax):
if not first: vskip = "\n"
print() if first:
vskip = ""
first = False first = False
print('txn %s (%s)' % (ashex(txn.tid), txn.status)) # XXX .status not covered by IStorageTransactionInformation
print('user: %r' % (txn.user,)) # XXX encode # XXX but covered by BaseStorage.TransactionRecord
print('description:', txn.description) # XXX encode out.write("%stxn %s %s\nuser %s\ndescription %s\nextension %s\n" % (
print('extension:', txn.extension) # XXX dict, encode vskip, ashex(txn.tid), escapeqq(txn.status),
escapeqq(txn.user),
escapeqq(txn.description),
escapeqq(txn_raw_extension(stor, txn)) ))
objv = txnobjv(txn) objv = txnobjv(txn)
for obj in objv: for obj in objv:
entry = 'obj %s ' % ashex(obj.oid) entry = "obj %s " % ashex(obj.oid)
write_data = False
if obj.data is None: if obj.data is None:
entry += 'delete' entry += "delete"
# was undo and data taken from obj.data_txn # was undo and data taken from obj.data_txn
elif obj.data_txn is not None: elif obj.data_txn is not None:
entry += 'from %s' % ashex(obj.data_txn) entry += "from %s" % ashex(obj.data_txn)
else:
# XXX sha1 is hardcoded for now. Dump format allows other hashes.
entry += "%i sha1:%s" % (len(obj.data), ashex(sha1(obj.data)))
write_data = True
out.write(entry)
if write_data:
if hashonly:
out.write(" -")
else: else:
entry += '%s %i' % (ashex(sha1(obj.data)), len(obj.data)) out.write("\n")
if not hashonly: out.write(obj.data)
entry += '\n'
entry += obj.data out.write("\n")
# ----------------------------------------
# XPickler is Pickler that tries to save objects stably
# in other words dicts/sets/... are pickled with items emitted always in the same order.
#
# NOTE we order objects by regular python objects "<", and in the general case
# python falls back to comparing objects by their addresses, so the comparison
# result is not in general stable from run to run. The following program
# prints True/False randomly with p. 50%:
# ---- 8< ----
# from random import choice
# class A: pass
# class B: pass
# if choice([True, False]):
# a = A()
# b = B()
# else:
# b = B()
# a = A()
# print a < b
# ---- 8< ----
#
# ( related reference: https://pythonhosted.org/BTrees/#total-ordering-and-persistence )
#
# We are ok with this semi-working solution(*) because it is only a fallback:
# for proper zodbdump usage it is advised for storage to provide
# IStorageTransactionInformationRaw with all raw metadata directly accessible.
#
# (*) but 100% working e.g. for keys = only strings or integers
#
# NOTE cannot use C pickler because hooking into internal machinery is not possible there.
class XPickler(pyPickler):
    # own copy of the dispatch table, so that handlers registered below do not
    # leak into the base pickler.
    dispatch = pyPickler.dispatch.copy()

    # save_dict pickles dict with its items emitted in sorted order.
    def save_dict(self, obj):
        # original pickler emits items taken from obj.iteritems()
        # let's prepare something with .iteritems() but emits those objs items ordered
        items = sorted(obj.items())     # sorts by key
        xitems = asiteritems(items)

        # NOTE the base implementation is called explicitly: it works for
        # both old- and new-style base classes, unlike super().
        # NOTE the base implementation memoizes what it is given, i.e. the
        # xitems wrapper and not obj itself, so a dict referenced several
        # times is emitted several times.
        pyPickler.save_dict(self, xitems)

    # the pickler finds handlers through the dispatch table, not through
    # method lookup - without this registration save_dict is never called.
    dispatch[dict] = save_dict

    # save_set pickles set with its elements emitted in sorted order.
    def save_set(self, obj):
        # set's reduce always return 3 values
        # https://github.com/python/cpython/blob/309fb90f/Objects/setobject.c#L1954
        #
        # the second one is the tuple of constructor arguments - a 1-tuple
        # holding the list of set elements - so it is that inner list which
        # has to be sorted.
        typ, args, dict_ = obj.__reduce_ex__(self.proto)
        (keyv,) = args
        keyv = sorted(keyv)
        self.save_reduce(typ, (keyv,), dict_, obj=obj)

    dispatch[set] = save_set
# asiteritems wraps already prepared items into an object whose .iteritems()
# emits exactly those items, in the order given.
# see save_dict() above for why/where it is needed.
class asiteritems(object):

    def __init__(self, items):
        self._items = items

    def iteritems(self):
        # a fresh iterator on every call
        items = self._items
        return iter(items)
# serializeext canonically serializes transaction's metadata "extension" dict
#
# Empty extension is serialized as empty bytes; anything else is pickled with
# XPickler at ZODB's pickle protocol.
def serializeext(ext):
    # ZODB iteration API gives us depickled extensions and only that.
    # So for dumping in raw form we need to pickle it back hopefully getting
    # something close to original raw data.

    if not ext:
        # ZODB usually does this: encode {} as empty "", not as "}."
        # https://github.com/zopefoundation/ZODB/blob/2490ae09/src/ZODB/BaseStorage.py#L194
        #
        # and here are decoders:
        # https://github.com/zopefoundation/ZODB/blob/2490ae09/src/ZODB/FileStorage/FileStorage.py#L1145
        # https://github.com/zopefoundation/ZODB/blob/2490ae09/src/ZODB/FileStorage/FileStorage.py#L1990
        # https://github.com/zopefoundation/ZODB/blob/2490ae09/src/ZODB/fstools.py#L66
        # ...
        return b""

    stream = BytesIO()
    XPickler(stream, _protocol).dump(ext)
    raw = stream.getvalue()
    #raw = pickletools.optimize(raw) # remove unneeded PUT opcodes

    # sanity check: what we emit must load back to what we were given
    assert loads(raw) == ext
    return raw
# ---------------------------------------- # ----------------------------------------
import sys, getopt import sys, getopt
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment