Commit d243caa3 authored by Jason Madden's avatar Jason Madden

Add a 'maxsize' parameter for imap_unordered to limit the number of buffered results. Fixes #638.

parent 770bb4a3
...@@ -33,6 +33,15 @@ Unreleased ...@@ -33,6 +33,15 @@ Unreleased
printed.) Reported in :issue:`617` by Jay Oster and Carlos Sanchez. printed.) Reported in :issue:`617` by Jay Oster and Carlos Sanchez.
- PyPy: Fix a ``TypeError`` from ``gevent.idle()``. Reported in - PyPy: Fix a ``TypeError`` from ``gevent.idle()``. Reported in
:issue:`639` by chilun2008. :issue:`639` by chilun2008.
- The ``imap_unordered`` methods of a pool support a ``maxsize``
parameter to limit the number of results buffered waiting for the
consumer. Reported in :issue:`638` by Sylvain Zimmer.
- The class ``gevent.queue.Queue`` now consistently orders multiple
blocked waiting ``put`` and ``get`` callers in the order they
arrived. Previously, due to an implementation quirk this was often
roughly the case under CPython, but not under PyPy. Now they both
behave the same.
- The class ``gevent.queue.Queue`` now supports the ``len()`` function.
.. _future: http://python-future.org .. _future: http://python-future.org
.. _bench_sendall.py: https://raw.githubusercontent.com/gevent/gevent/master/greentest/bench_sendall.py .. _bench_sendall.py: https://raw.githubusercontent.com/gevent/gevent/master/greentest/bench_sendall.py
......
...@@ -31,10 +31,27 @@ __all__ = ['Group', 'Pool'] ...@@ -31,10 +31,27 @@ __all__ = ['Group', 'Pool']
class IMapUnordered(Greenlet): class IMapUnordered(Greenlet):
"""
At iterator of map results.
"""
_zipped = False _zipped = False
_queue_max_size = None
def __init__(self, func, iterable, spawn=None, maxsize=None, _zipped=False):
"""
An iterator that.
def __init__(self, func, iterable, spawn=None, _zipped=False): :keyword int maxsize: If given and not-None, specifies the maximum number of
finished results that will be allowed to accumulated awaiting the reader;
more than that number of results will cause map function greenlets to begin
to block. This is most useful is there is a great disparity in the speed of
the mapping code and the consumer and the results consume a great deal of resources.
Using a bound is more computationally expensive than not using a bound.
.. versionchanged:: 1.1b3
Added the *maxsize* parameter.
"""
from gevent.queue import Queue from gevent.queue import Queue
Greenlet.__init__(self) Greenlet.__init__(self)
if spawn is not None: if spawn is not None:
...@@ -43,10 +60,20 @@ class IMapUnordered(Greenlet): ...@@ -43,10 +60,20 @@ class IMapUnordered(Greenlet):
self._zipped = _zipped self._zipped = _zipped
self.func = func self.func = func
self.iterable = iterable self.iterable = iterable
self.queue = Queue() self.queue = Queue(maxsize)
if maxsize:
self._queue_max_size = maxsize
self.count = 0 self.count = 0
self.finished = False self.finished = False
self.rawlink(self._on_finish) # If the queue size is unbounded, then we want to call all
# the links (_on_finish and _on_result) directly in the hub greenlet
# for efficiency. However, if the queue is bounded, we can't do that if
# the queue might block (because if there's no waiter the hub can switch to,
# the queue simply raises Full). Therefore, in that case, we use
# the safer, somewhat-slower (because it spawns a greenlet) link() methods.
# This means that _on_finish and _on_result can be called and interleaved in any order
# if the call to self.queue.put() blocks.
self.rawlink(self._on_finish) if not maxsize else self.link(self._on_finish)
def __iter__(self): def __iter__(self):
return self return self
...@@ -64,7 +91,7 @@ class IMapUnordered(Greenlet): ...@@ -64,7 +91,7 @@ class IMapUnordered(Greenlet):
def _ispawn(self, func, item): def _ispawn(self, func, item):
self.count += 1 self.count += 1
g = self.spawn(func, item) if not self._zipped else self.spawn(func, *item) g = self.spawn(func, item) if not self._zipped else self.spawn(func, *item)
g.rawlink(self._on_result) g.rawlink(self._on_result) if not self._queue_max_size else g.link(self._on_result)
return g return g
def _run(self): def _run(self):
...@@ -78,28 +105,42 @@ class IMapUnordered(Greenlet): ...@@ -78,28 +105,42 @@ class IMapUnordered(Greenlet):
self.__dict__.pop('iterable', None) self.__dict__.pop('iterable', None)
def _on_result(self, greenlet): def _on_result(self, greenlet):
# This method can either be called in the hub greenlet (if the
# queue is unbounded) or its own greenlet. If it's called in
# its own greenlet, the calls to put() may block and switch
# greenlets, which in turn could mutate our state. So any
# state on this object that we need to look at, notably
# self.count, we need to capture or mutate *before* we put
self.count -= 1 self.count -= 1
count = self.count
finished = self.finished
ready = self.ready()
put_finished = False
if ready and count <= 0 and not finished:
finished = self.finished = True
put_finished = True
if greenlet.successful(): if greenlet.successful():
self.queue.put(self._iqueue_value_for_success(greenlet)) self.queue.put(self._iqueue_value_for_success(greenlet))
else: else:
self.queue.put(self._iqueue_value_for_failure(greenlet)) self.queue.put(self._iqueue_value_for_failure(greenlet))
if self.ready() and self.count <= 0 and not self.finished: if put_finished:
self.queue.put(self._iqueue_value_for_finished()) self.queue.put(self._iqueue_value_for_finished())
self.finished = True
def _on_finish(self, _self): def _on_finish(self, _self):
if self.finished: if self.finished:
return return
if not self.successful(): if not self.successful():
self.queue.put(self._iqueue_value_for_self_failure())
self.finished = True self.finished = True
self.queue.put(self._iqueue_value_for_self_failure())
return return
if self.count <= 0: if self.count <= 0:
self.queue.put(self._iqueue_value_for_finished())
self.finished = True self.finished = True
self.queue.put(self._iqueue_value_for_finished())
def _iqueue_value_for_success(self, greenlet): def _iqueue_value_for_success(self, greenlet):
return greenlet.value return greenlet.value
...@@ -121,11 +162,11 @@ class IMap(IMapUnordered): ...@@ -121,11 +162,11 @@ class IMap(IMapUnordered):
# We do this by storing tuples (order, value) in the queue # We do this by storing tuples (order, value) in the queue
# not just value. # not just value.
def __init__(self, func, iterable, spawn=None, _zipped=False): def __init__(self, *args, **kwargs):
self.waiting = [] # QQQ maybe deque will work faster there? self.waiting = [] # QQQ maybe deque will work faster there?
self.index = 0 self.index = 0
self.maxindex = -1 self.maxindex = -1
IMapUnordered.__init__(self, func, iterable, spawn, _zipped) IMapUnordered.__init__(self, *args, **kwargs)
def _inext(self): def _inext(self):
while True: while True:
...@@ -246,16 +287,64 @@ class GroupMappingMixin(object): ...@@ -246,16 +287,64 @@ class GroupMappingMixin(object):
""" """
return Greenlet.spawn(self.map_cb, func, iterable, callback) return Greenlet.spawn(self.map_cb, func, iterable, callback)
def imap(self, func, *iterables): def __imap(self, cls, func, *iterables, **kwargs):
"""An equivalent of itertools.imap()""" # Python 2 doesn't support the syntax that lets us mix varargs and
return IMap.spawn(func, izip(*iterables), spawn=self.spawn, # a named kwarg, so we have to unpack manually
_zipped=True) maxsize = kwargs.pop('maxsize', None)
if kwargs:
raise TypeError("Unsupported keyword arguments")
return cls.spawn(func, izip(*iterables), spawn=self.spawn,
_zipped=True, maxsize=maxsize)
def imap(self, func, *iterables, **kwargs):
"""
imap(func, *iterables, maxsize=None) -> iterable
def imap_unordered(self, func, *iterables): An equivalent of :func:`itertools.imap`, operating in parallel.
"""The same as imap() except that the ordering of the results from the The *func* is applied to each element yielded from each
returned iterator should be considered in arbitrary order.""" iterable in *iterables* in turn, collecting the result.
return IMapUnordered.spawn(func, izip(*iterables), spawn=self.spawn,
_zipped=True) If this object has a bound on the number of active greenlets it can
contain (such as :class:`Pool`), then at most that number of tasks will operate
in parallel.
:keyword int maxsize: If given and not-None, specifies the maximum number of
finished results that will be allowed to accumulate awaiting the reader;
more than that number of results will cause map function greenlets to begin
to block. This is most useful is there is a great disparity in the speed of
the mapping code and the consumer and the results consume a great deal of resources.
.. note:: This is separate from any bound on the number of active parallel
tasks.
.. note:: Using a bound is slightly more computationally expensive than not using a bound.
.. tip:: The :meth:`imap_unordered` method makes much better
use of this parameter. Some additional, unspecified,
number of objects may be required to be kept in memory
to maintain order by this function.
:return: An iterable object.
.. versionchanged:: 1.1b3
Added the *maxsize* keyword parameter.
"""
return self.__imap(IMap, func, *iterables, **kwargs)
def imap_unordered(self, func, *iterables, **kwargs):
"""
imap_unordered(func, *iterables, maxsize=None) -> iterable
The same as :meth:`imap` except that the ordering of the results
from the returned iterator should be considered in arbitrary
order.
This is lighter weight than :meth:`imap` and should be preferred if order
doesn't matter.
.. seealso:: :meth:`imap` for more details.
"""
return self.__imap(IMapUnordered, func, *iterables, **kwargs)
class Group(GroupMappingMixin): class Group(GroupMappingMixin):
......
...@@ -5,8 +5,6 @@ The :mod:`gevent.queue` module implements multi-producer, multi-consumer queues ...@@ -5,8 +5,6 @@ The :mod:`gevent.queue` module implements multi-producer, multi-consumer queues
that work across greenlets, with the API similar to the classes found in the that work across greenlets, with the API similar to the classes found in the
standard :mod:`Queue` and :class:`multiprocessing <multiprocessing.Queue>` modules. standard :mod:`Queue` and :class:`multiprocessing <multiprocessing.Queue>` modules.
Changed in version 1.0: Queue(0) now means queue of infinite size, not a channel.
The classes in this module implement iterator protocol. Iterating over queue The classes in this module implement iterator protocol. Iterating over queue
means repeatedly calling :meth:`get <Queue.get>` until :meth:`get <Queue.get>` returns ``StopIteration``. means repeatedly calling :meth:`get <Queue.get>` until :meth:`get <Queue.get>` returns ``StopIteration``.
...@@ -18,6 +16,10 @@ means repeatedly calling :meth:`get <Queue.get>` until :meth:`get <Queue.get>` r ...@@ -18,6 +16,10 @@ means repeatedly calling :meth:`get <Queue.get>` until :meth:`get <Queue.get>` r
... print(item) ... print(item)
1 1
2 2
.. versionchanged:: 1.0
``Queue(0)`` now means queue of infinite size, not a channel. A :exc:`DeprecationWarning`
will be issued with this argument.
""" """
from __future__ import absolute_import from __future__ import absolute_import
...@@ -40,9 +42,21 @@ __all__ = ['Queue', 'PriorityQueue', 'LifoQueue', 'JoinableQueue', 'Channel'] ...@@ -40,9 +42,21 @@ __all__ = ['Queue', 'PriorityQueue', 'LifoQueue', 'JoinableQueue', 'Channel']
class Queue(object): class Queue(object):
"""Create a queue object with a given maximum size. """
Create a queue object with a given maximum size.
If *maxsize* is less than or equal to zero or ``None``, the queue size is infinite.
If *maxsize* is less than or equal to zero or ``None``, the queue
size is infinite.
.. versionchanged:: 1.1b3
Queue's now support :func:`len`; it behaves the same as :meth:`qsize`.
.. versionchanged:: 1.1b3
Multiple greenlets that block on a call to :meth:`put` for a full queue
will now be woken up to put their items into the queue in the order in which
they arrived. Likewise, multiple greenlets that block on a call to :meth:`get` for
an empty queue will now receive items in the order in which they blocked. An
implementation quirk under CPython *usually* ensured this was roughly the case
previously anyway, but that wasn't the case for PyPy.
""" """
def __init__(self, maxsize=None, items=None): def __init__(self, maxsize=None, items=None):
...@@ -54,8 +68,20 @@ class Queue(object): ...@@ -54,8 +68,20 @@ class Queue(object):
DeprecationWarning, stacklevel=2) DeprecationWarning, stacklevel=2)
else: else:
self.maxsize = maxsize self.maxsize = maxsize
self.getters = set() # Explicitly maintain order for getters and putters that block
self.putters = set() # so that callers can consistently rely on getting things out
# in the apparent order they went in. This is required by
# imap_unordered. Previously these were set() objects, and the
# items put in the set have default hash() and eq() methods;
# under CPython, since new objects tend to have increasing
# hash values, this tended to roughly maintain order anyway,
# but that's not true under PyPy. An alternative to a deque
# (to avoid the linear scan of remove()) might be an
# OrderedDict, but it's 2.7 only; we don't expect to have so
# many waiters that removing an arbitrary element is a
# bottleneck, though.
self.getters = collections.deque()
self.putters = collections.deque()
self.hub = get_hub() self.hub = get_hub()
self._event_unlock = None self._event_unlock = None
if items: if items:
...@@ -108,6 +134,29 @@ class Queue(object): ...@@ -108,6 +134,29 @@ class Queue(object):
"""Return the size of the queue.""" """Return the size of the queue."""
return len(self.queue) return len(self.queue)
def __len__(self):
"""
Return the size of the queue. This is the same as :meth:`qsize`.
.. versionadded: 1.1b3
Previously, getting len() of a queue would raise a TypeError.
"""
return self.qsize()
def __bool__(self):
"""
A queue object is always True.
.. versionadded: 1.1b3
Now that queues support len(), they need to implement ``__bool__``
to return True for backwards compatibility.
"""
return True
__nonzero__ = __bool__
def empty(self): def empty(self):
"""Return ``True`` if the queue is empty, ``False`` otherwise.""" """Return ``True`` if the queue is empty, ``False`` otherwise."""
return not self.qsize() return not self.qsize()
...@@ -139,7 +188,7 @@ class Queue(object): ...@@ -139,7 +188,7 @@ class Queue(object):
# We're in the mainloop, so we cannot wait; we can switch to other greenlets though. # We're in the mainloop, so we cannot wait; we can switch to other greenlets though.
# Check if possible to get a free slot in the queue. # Check if possible to get a free slot in the queue.
while self.getters and self.qsize() and self.qsize() >= self.maxsize: while self.getters and self.qsize() and self.qsize() >= self.maxsize:
getter = self.getters.pop() getter = self.getters.popleft()
getter.switch(getter) getter.switch(getter)
if self.qsize() < self.maxsize: if self.qsize() < self.maxsize:
self._put(item) self._put(item)
...@@ -147,16 +196,20 @@ class Queue(object): ...@@ -147,16 +196,20 @@ class Queue(object):
raise Full raise Full
elif block: elif block:
waiter = ItemWaiter(item, self) waiter = ItemWaiter(item, self)
self.putters.add(waiter) self.putters.append(waiter)
timeout = Timeout.start_new(timeout, Full) timeout = Timeout.start_new(timeout, Full) if timeout is not None else None
try: try:
if self.getters: if self.getters:
self._schedule_unlock() self._schedule_unlock()
result = waiter.get() result = waiter.get()
assert result is waiter, "Invalid switch into Queue.put: %r" % (result, ) assert result is waiter, "Invalid switch into Queue.put: %r" % (result, )
finally: finally:
if timeout is not None:
timeout.cancel() timeout.cancel()
self.putters.discard(waiter) try:
self.putters.remove(waiter)
except ValueError:
pass # removed by unlock
else: else:
raise Full raise Full
...@@ -186,23 +239,27 @@ class Queue(object): ...@@ -186,23 +239,27 @@ class Queue(object):
# special case to make get_nowait() runnable in the mainloop greenlet # special case to make get_nowait() runnable in the mainloop greenlet
# there are no items in the queue; try to fix the situation by unlocking putters # there are no items in the queue; try to fix the situation by unlocking putters
while self.putters: while self.putters:
self.putters.pop().put_and_switch() self.putters.popleft().put_and_switch()
if self.qsize(): if self.qsize():
return self._get() return self._get()
raise Empty raise Empty
elif block: elif block:
waiter = Waiter() waiter = Waiter()
timeout = Timeout.start_new(timeout, Empty) timeout = Timeout.start_new(timeout, Empty) if timeout is not None else None
try: try:
self.getters.add(waiter) self.getters.append(waiter)
if self.putters: if self.putters:
self._schedule_unlock() self._schedule_unlock()
result = waiter.get() result = waiter.get()
assert result is waiter, 'Invalid switch into Queue.get: %r' % (result, ) assert result is waiter, 'Invalid switch into Queue.get: %r' % (result, )
return self._get() return self._get()
finally: finally:
self.getters.discard(waiter) if timeout is not None:
timeout.cancel() timeout.cancel()
try:
self.getters.remove(waiter)
except ValueError:
pass # Removed by _unlock
else: else:
raise Empty raise Empty
...@@ -259,7 +316,7 @@ class Queue(object): ...@@ -259,7 +316,7 @@ class Queue(object):
if self.putters and (self.maxsize is None or self.qsize() < self.maxsize): if self.putters and (self.maxsize is None or self.qsize() < self.maxsize):
repeat = True repeat = True
try: try:
putter = self.putters.pop() putter = self.putters.popleft()
self._put(putter.item) self._put(putter.item)
except: except:
putter.throw(*sys.exc_info()) putter.throw(*sys.exc_info())
...@@ -267,7 +324,7 @@ class Queue(object): ...@@ -267,7 +324,7 @@ class Queue(object):
putter.switch(putter) putter.switch(putter)
if self.getters and self.qsize(): if self.getters and self.qsize():
repeat = True repeat = True
getter = self.getters.pop() getter = self.getters.popleft()
getter.switch(getter) getter.switch(getter)
if not repeat: if not repeat:
return return
......
# Test idle
import gevent
gevent.sleep()
gevent.idle()
...@@ -380,6 +380,32 @@ class TestPool(greentest.TestCase): ...@@ -380,6 +380,32 @@ class TestPool(greentest.TestCase):
result = list(self.pool.imap_unordered(sqr, final_sleep())) result = list(self.pool.imap_unordered(sqr, final_sleep()))
self.assertEqual(result, [0, 1, 4]) self.assertEqual(result, [0, 1, 4])
# Issue 638
def test_imap_unordered_bounded_queue(self):
iterable = list(range(100))
def short_running_func(i, j):
return i
# Send two iterables to make sure varargs and kwargs are handled
# correctly
mapping = self.pool.imap_unordered(short_running_func, iterable, iterable,
maxsize=1)
mapping.start()
# Simulate a long running reader. No matter how many workers
# we have, we will never have a queue more than size 1
def reader():
result = []
for x in mapping:
result.append(x)
gevent.sleep(0.01)
self.assertEqual(len(mapping.queue), 1)
return result
l = reader()
self.assertEqual(sorted(l), iterable)
class TestPool2(TestPool): class TestPool2(TestPool):
size = 2 size = 2
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment