blob: ce07ebeb142d926aa54b124eca8623b8c483eeb4 [file] [log] [blame]
Olivier Deprezf4ef2d02021-04-20 13:36:24 +02001"""Interface to the libbzip2 compression library.
2
3This module provides a file interface, classes for incremental
4(de)compression, and functions for one-shot (de)compression.
5"""
6
7__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
8 "open", "compress", "decompress"]
9
10__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
12from builtins import open as _builtin_open
13import io
14import os
15import _compression
16from threading import RLock
17
18from _bz2 import BZ2Compressor, BZ2Decompressor
19
20
21_MODE_CLOSED = 0
22_MODE_READ = 1
23# Value 2 no longer used
24_MODE_WRITE = 3
25
26
27class BZ2File(_compression.BaseStream):
28
29 """A file object providing transparent bzip2 (de)compression.
30
31 A BZ2File can act as a wrapper for an existing file object, or refer
32 directly to a named file on disk.
33
34 Note that BZ2File provides a *binary* file interface - data read is
35 returned as bytes, and data to be written should be given as bytes.
36 """
37
38 def __init__(self, filename, mode="r", *, compresslevel=9):
39 """Open a bzip2-compressed file.
40
41 If filename is a str, bytes, or PathLike object, it gives the
42 name of the file to be opened. Otherwise, it should be a file
43 object, which will be used to read or write the compressed data.
44
45 mode can be 'r' for reading (default), 'w' for (over)writing,
46 'x' for creating exclusively, or 'a' for appending. These can
47 equivalently be given as 'rb', 'wb', 'xb', and 'ab'.
48
49 If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
50 and 9 specifying the level of compression: 1 produces the least
51 compression, and 9 (default) produces the most compression.
52
53 If mode is 'r', the input file may be the concatenation of
54 multiple compressed streams.
55 """
56 # This lock must be recursive, so that BufferedIOBase's
57 # writelines() does not deadlock.
58 self._lock = RLock()
59 self._fp = None
60 self._closefp = False
61 self._mode = _MODE_CLOSED
62
63 if not (1 <= compresslevel <= 9):
64 raise ValueError("compresslevel must be between 1 and 9")
65
66 if mode in ("", "r", "rb"):
67 mode = "rb"
68 mode_code = _MODE_READ
69 elif mode in ("w", "wb"):
70 mode = "wb"
71 mode_code = _MODE_WRITE
72 self._compressor = BZ2Compressor(compresslevel)
73 elif mode in ("x", "xb"):
74 mode = "xb"
75 mode_code = _MODE_WRITE
76 self._compressor = BZ2Compressor(compresslevel)
77 elif mode in ("a", "ab"):
78 mode = "ab"
79 mode_code = _MODE_WRITE
80 self._compressor = BZ2Compressor(compresslevel)
81 else:
82 raise ValueError("Invalid mode: %r" % (mode,))
83
84 if isinstance(filename, (str, bytes, os.PathLike)):
85 self._fp = _builtin_open(filename, mode)
86 self._closefp = True
87 self._mode = mode_code
88 elif hasattr(filename, "read") or hasattr(filename, "write"):
89 self._fp = filename
90 self._mode = mode_code
91 else:
92 raise TypeError("filename must be a str, bytes, file or PathLike object")
93
94 if self._mode == _MODE_READ:
95 raw = _compression.DecompressReader(self._fp,
96 BZ2Decompressor, trailing_error=OSError)
97 self._buffer = io.BufferedReader(raw)
98 else:
99 self._pos = 0
100
101 def close(self):
102 """Flush and close the file.
103
104 May be called more than once without error. Once the file is
105 closed, any other operation on it will raise a ValueError.
106 """
107 with self._lock:
108 if self._mode == _MODE_CLOSED:
109 return
110 try:
111 if self._mode == _MODE_READ:
112 self._buffer.close()
113 elif self._mode == _MODE_WRITE:
114 self._fp.write(self._compressor.flush())
115 self._compressor = None
116 finally:
117 try:
118 if self._closefp:
119 self._fp.close()
120 finally:
121 self._fp = None
122 self._closefp = False
123 self._mode = _MODE_CLOSED
124 self._buffer = None
125
126 @property
127 def closed(self):
128 """True if this file is closed."""
129 return self._mode == _MODE_CLOSED
130
131 def fileno(self):
132 """Return the file descriptor for the underlying file."""
133 self._check_not_closed()
134 return self._fp.fileno()
135
136 def seekable(self):
137 """Return whether the file supports seeking."""
138 return self.readable() and self._buffer.seekable()
139
140 def readable(self):
141 """Return whether the file was opened for reading."""
142 self._check_not_closed()
143 return self._mode == _MODE_READ
144
145 def writable(self):
146 """Return whether the file was opened for writing."""
147 self._check_not_closed()
148 return self._mode == _MODE_WRITE
149
150 def peek(self, n=0):
151 """Return buffered data without advancing the file position.
152
153 Always returns at least one byte of data, unless at EOF.
154 The exact number of bytes returned is unspecified.
155 """
156 with self._lock:
157 self._check_can_read()
158 # Relies on the undocumented fact that BufferedReader.peek()
159 # always returns at least one byte (except at EOF), independent
160 # of the value of n
161 return self._buffer.peek(n)
162
163 def read(self, size=-1):
164 """Read up to size uncompressed bytes from the file.
165
166 If size is negative or omitted, read until EOF is reached.
167 Returns b'' if the file is already at EOF.
168 """
169 with self._lock:
170 self._check_can_read()
171 return self._buffer.read(size)
172
173 def read1(self, size=-1):
174 """Read up to size uncompressed bytes, while trying to avoid
175 making multiple reads from the underlying stream. Reads up to a
176 buffer's worth of data if size is negative.
177
178 Returns b'' if the file is at EOF.
179 """
180 with self._lock:
181 self._check_can_read()
182 if size < 0:
183 size = io.DEFAULT_BUFFER_SIZE
184 return self._buffer.read1(size)
185
186 def readinto(self, b):
187 """Read bytes into b.
188
189 Returns the number of bytes read (0 for EOF).
190 """
191 with self._lock:
192 self._check_can_read()
193 return self._buffer.readinto(b)
194
195 def readline(self, size=-1):
196 """Read a line of uncompressed bytes from the file.
197
198 The terminating newline (if present) is retained. If size is
199 non-negative, no more than size bytes will be read (in which
200 case the line may be incomplete). Returns b'' if already at EOF.
201 """
202 if not isinstance(size, int):
203 if not hasattr(size, "__index__"):
204 raise TypeError("Integer argument expected")
205 size = size.__index__()
206 with self._lock:
207 self._check_can_read()
208 return self._buffer.readline(size)
209
210 def readlines(self, size=-1):
211 """Read a list of lines of uncompressed bytes from the file.
212
213 size can be specified to control the number of lines read: no
214 further lines will be read once the total size of the lines read
215 so far equals or exceeds size.
216 """
217 if not isinstance(size, int):
218 if not hasattr(size, "__index__"):
219 raise TypeError("Integer argument expected")
220 size = size.__index__()
221 with self._lock:
222 self._check_can_read()
223 return self._buffer.readlines(size)
224
225 def write(self, data):
226 """Write a byte string to the file.
227
228 Returns the number of uncompressed bytes written, which is
229 always len(data). Note that due to buffering, the file on disk
230 may not reflect the data written until close() is called.
231 """
232 with self._lock:
233 self._check_can_write()
234 compressed = self._compressor.compress(data)
235 self._fp.write(compressed)
236 self._pos += len(data)
237 return len(data)
238
239 def writelines(self, seq):
240 """Write a sequence of byte strings to the file.
241
242 Returns the number of uncompressed bytes written.
243 seq can be any iterable yielding byte strings.
244
245 Line separators are not added between the written byte strings.
246 """
247 with self._lock:
248 return _compression.BaseStream.writelines(self, seq)
249
250 def seek(self, offset, whence=io.SEEK_SET):
251 """Change the file position.
252
253 The new position is specified by offset, relative to the
254 position indicated by whence. Values for whence are:
255
256 0: start of stream (default); offset must not be negative
257 1: current stream position
258 2: end of stream; offset must not be positive
259
260 Returns the new file position.
261
262 Note that seeking is emulated, so depending on the parameters,
263 this operation may be extremely slow.
264 """
265 with self._lock:
266 self._check_can_seek()
267 return self._buffer.seek(offset, whence)
268
269 def tell(self):
270 """Return the current file position."""
271 with self._lock:
272 self._check_not_closed()
273 if self._mode == _MODE_READ:
274 return self._buffer.tell()
275 return self._pos
276
277
278def open(filename, mode="rb", compresslevel=9,
279 encoding=None, errors=None, newline=None):
280 """Open a bzip2-compressed file in binary or text mode.
281
282 The filename argument can be an actual filename (a str, bytes, or
283 PathLike object), or an existing file object to read from or write
284 to.
285
286 The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
287 "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
288 The default mode is "rb", and the default compresslevel is 9.
289
290 For binary mode, this function is equivalent to the BZ2File
291 constructor: BZ2File(filename, mode, compresslevel). In this case,
292 the encoding, errors and newline arguments must not be provided.
293
294 For text mode, a BZ2File object is created, and wrapped in an
295 io.TextIOWrapper instance with the specified encoding, error
296 handling behavior, and line ending(s).
297
298 """
299 if "t" in mode:
300 if "b" in mode:
301 raise ValueError("Invalid mode: %r" % (mode,))
302 else:
303 if encoding is not None:
304 raise ValueError("Argument 'encoding' not supported in binary mode")
305 if errors is not None:
306 raise ValueError("Argument 'errors' not supported in binary mode")
307 if newline is not None:
308 raise ValueError("Argument 'newline' not supported in binary mode")
309
310 bz_mode = mode.replace("t", "")
311 binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)
312
313 if "t" in mode:
314 return io.TextIOWrapper(binary_file, encoding, errors, newline)
315 else:
316 return binary_file
317
318
319def compress(data, compresslevel=9):
320 """Compress a block of data.
321
322 compresslevel, if given, must be a number between 1 and 9.
323
324 For incremental compression, use a BZ2Compressor object instead.
325 """
326 comp = BZ2Compressor(compresslevel)
327 return comp.compress(data) + comp.flush()
328
329
330def decompress(data):
331 """Decompress a block of data.
332
333 For incremental decompression, use a BZ2Decompressor object instead.
334 """
335 results = []
336 while data:
337 decomp = BZ2Decompressor()
338 try:
339 res = decomp.decompress(data)
340 except OSError:
341 if results:
342 break # Leftover data is not a valid bzip2 stream; ignore it.
343 else:
344 raise # Error on the first iteration; bail out.
345 results.append(res)
346 if not decomp.eof:
347 raise ValueError("Compressed data ended before the "
348 "end-of-stream marker was reached")
349 data = decomp.unused_data
350 return b"".join(results)