Python's mailbox.mbox cannot open compressed mailboxes; it silently reports them as empty:
>>> import mailbox
>>> print(len(mailbox.mbox("/tmp/test.mbox")))
9
>>> print(len(mailbox.mbox("/tmp/test.mbox.gz")))
0
>>> print(len(mailbox.mbox("/tmp/test1.mbox.xz")))
0
For a prototype rewrite of the MIA team's Echelon (the engine behind mia-query), I needed to scan compressed mailboxes, and I had to work around this limitation.
Here is the alternative mailbox.mbox implementation:
import bz2
import gzip
import lzma
import mailbox
from pathlib import Path
from typing import BinaryIO, Generator
class StreamMbox(mailbox.mbox):
    """
    Read-only mailbox.mbox that reads from an already-open binary stream.

    mailbox.mbox does not support opening a stream, which is sad.
    This is a subclass that works around it by setting up the attributes of
    the parent constructors by hand around a caller-supplied file object,
    instead of opening a path.
    """

    def __init__(self, fd: BinaryIO, factory=None, create: bool = True):
        """
        Initialise the mailbox state around an existing binary stream.

        :param fd: binary file object containing the mbox data; it becomes
            the mailbox's underlying file
        :param factory: optional message factory, stored unchanged
        :param create: unused; accepted only so the signature mirrors
            mailbox.mbox
        """
        # NOTE(review): the inherited mbox parsing presumably needs seek() and
        # tell() on fd -- confirm before passing a non-seekable stream.

        # Do not call parent __init__, just redo everything here to be able to
        # open a stream. This will need to be re-reviewed for every new version
        # of python's stdlib.

        # Mailbox constructor
        self._path = None  # There is no filesystem path behind a stream
        self._factory = factory

        # _singlefileMailbox constructor
        self._file = fd
        self._toc = None
        self._next_key = 0
        self._pending = False  # No changes require rewriting the file.
        self._pending_sync = False  # No need to sync the file
        self._locked = False
        self._file_length = None  # Used to record mailbox size

        # mbox constructor
        self._message_factory = mailbox.mboxMessage

    def flush(self):
        """
        Refuse to write changes back: the mailbox is read-only.

        :raises NotImplementedError: always
        """
        raise NotImplementedError("StreamMbox is a readonly class")

    def close(self):
        """
        Unlock the mailbox if needed and close the underlying stream.

        The inherited close() begins by calling flush(), which always raises
        in this class, so closing a StreamMbox used to fail with
        NotImplementedError. Nothing is ever written here, so the flush step
        is simply skipped.
        """
        try:
            if self._locked:
                self.unlock()
        finally:
            self._file.close()
class UsageExample:
    """
    Example of scanning a possibly compressed mbox file through StreamMbox.
    """

    # Maps a file suffix to the function used to open that kind of file
    DECOMPRESS = {
        ".xz": lzma.open,
        ".gz": gzip.open,
        ".bz2": bz2.open,
    }

    @classmethod
    def scan(cls, path: Path) -> Generator[ScannedEmail, None, None]:
        """
        Open the mailbox at ``path`` and yield what scan_fd produces for it.

        The opener is chosen from DECOMPRESS by the path's suffix; any suffix
        not listed there is opened with the builtin open().
        """
        opener = cls.DECOMPRESS.get(path.suffix)
        if opener is None:
            # No known compression suffix: read the file as it is
            opener = open

        with opener(path.as_posix(), "rb") as stream:
            yield from cls.scan_fd(path, stream)

    @classmethod
    def scan_fd(cls, path: Path, fd: BinaryIO) -> Generator[ScannedEmail, None, None]:
        """
        Iterate the messages of the mailbox read from the open stream ``fd``.

        The per-message processing is elided in this example.
        """
        messages = StreamMbox(fd)
        for message in messages:
            ...