Skip to content

Reading

tabbed.reading

A reader of text delimited files that supports the following features:

  • Identification of metadata & header file sections.
  • Automated type conversion to ints, floats, complex numbers, times, dates and datetime instances.
  • Selective reading of rows and columns satisfying equality, membership, regular expression, and rich comparison conditions.
  • Iterative reading of rows from the input file.

tabbed.reading.Reader

Bases: ReprMixin

An iterative reader of irregular text files supporting selective value-based reading of rows and columns.

A common variant to the RFC-4180 CSV standard includes metadata prior to a possible header and data section. This reader sniffs files for these sections advancing to the most-likely start position of the data. Additionally, it uses type inference to automatically convert data cells into strings, integers, floats, complex, time, date or datetime instances. Finally, this reader supports selective reading of rows using equality, membership, comparison, & regular expression value-based conditions supplied as keyword arguments to the 'tab' method.

Attributes:

Name Type Description
infile

An I/O stream instance returned by open.

tabulator

A callable container of Tab instances; callables that will filter rows based on equality, membership, rich comparison and regular expression conditions.

errors

A container of casting and ragged length errors detected during reading.

Examples:

>>> # Create a temporary file for reading
>>> import os
>>> import tempfile
>>> import random
>>> from datetime import datetime, timedelta
>>> # make metadata that spans several lines
>>> metadata_string = ('Experiment, 3\n'
... 'Name, Ernst Rutherford\n'
... 'location, Cavendish Labs\n'
... 'Time, 11:03:29.092\n'
... 'Date, 8/23/1917\n'
... '\n')
>>> # make a header of 5 columns
>>> header = ['group', 'count', 'color', 'time', 'area']
>>> header_string = ','.join(header) + '\n'
>>> # make a reproducible data section with 20 rows
>>> random.seed(0)
>>> groups = random.choices(['a', 'b', 'c'], k=20)
>>> counts = [str(random.randint(0, 10)) for _ in range(20)]
>>> colors = random.choices(['red', 'green', 'blue'], k=20)
>>> start = datetime(1917, 8, 23, 11, 3, 29, 9209)
>>> times = [str(start + timedelta(seconds=10*i)) for i in range(20)]
>>> areas = [str(random.uniform(0, 10)) for _ in range(20)]
>>> x = [','.join(row) for row in zip(
...    groups, counts, colors, times, areas)]
>>> data_string = '\r\n'.join(x)
>>> # write the metadata, header and data strings
>>> fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
>>> _ = fp.write(metadata_string)
>>> _ = fp.write(header_string)
>>> _ = fp.write(data_string)
>>> fp.close()
>>> # open the file for reading
>>> infile = open(fp.name, mode='r')
>>> reader = Reader(infile)
>>> # ask the reader for the header
>>> reader.header
...
Header(line=6,
names=['group', 'count', 'color', 'time', 'area'],
string='group,count,color,time,area')
>>> # read group, count & area columns where group is a or c & 0 < area <=4
>>> # by passing keyword args to this reader's 'tab' method
>>> reader.tab(columns=['group', 'count', 'area'],
... group=['a', 'c'],
... area='> 0 and <= 4')
>>> # read the data with a chunksize of 3 rows
>>> rows = reader.read(chunksize=3)
>>> type(rows) # rows are of type generator yielding 3 rows at a time
<class 'generator'>
>>> for idx, chunk in enumerate(rows):
...     print(f'Index = {idx}\n{chunk}')
...
Index = 0
[{'group': 'c', 'count': 4, 'area': 3.2005460467254574},
{'group': 'a', 'count': 10, 'area': 1.0905784593110368},
{'group': 'c', 'count': 7, 'area': 2.90329502402758}]
Index = 1
[{'group': 'c', 'count': 8, 'area': 1.8939132855435614},
{'group': 'c', 'count': 4, 'area': 1.867295282555551}]
>>> # close reader since it was not opened with context manager
>>> reader.close()
>>> os.remove(fp.name) # explicitly remove the tempfile
Source code in src/tabbed/reading.py
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
class Reader(ReprMixin):
    r"""An iterative reader of irregular text files supporting selective
    value-based reading of rows and columns.

    A common variant to the RFC-4180 CSV standard includes metadata prior to
    a possible header and data section. This reader sniffs files for these
    sections advancing to the most-likely start position of the data.
    Additionally, it uses type inference to automatically convert data cells
    into strings, integers, floats, complex, time, date or datetime instances.
    Finally, this reader supports selective reading of rows using equality,
    membership, comparison, & regular expression value-based conditions supplied
    as keyword arguments to the 'tab' method.

    Attributes:
        infile:
            An I/O stream instance returned by open.
        poll:
            The number of last sample rows the Sniffer uses to detect the
            header, metadata and data types.
        exclude:
            A sequence of string values indicating missing data; rows
            containing these values are disqualified from header, metadata
            and data type detection.
        tabulator:
            A callable container of Tab instances; callables that will filter
            rows based on equality, membership, rich comparison and regular
            expression conditions.
        errors:
            A container of casting and ragged length errors detected during
            reading.

    Examples:
        >>> # Create a temporary file for reading
        >>> import os
        >>> import tempfile
        >>> import random
        >>> from datetime import datetime, timedelta
        >>> # make metadata that spans several lines
        >>> metadata_string = ('Experiment, 3\n'
        ... 'Name, Ernst Rutherford\n'
        ... 'location, Cavendish Labs\n'
        ... 'Time, 11:03:29.092\n'
        ... 'Date, 8/23/1917\n'
        ... '\n')
        >>> # make a header of 5 columns
        >>> header = ['group', 'count', 'color', 'time', 'area']
        >>> header_string = ','.join(header) + '\n'
        >>> # make a reproducible data section with 20 rows
        >>> random.seed(0)
        >>> groups = random.choices(['a', 'b', 'c'], k=20)
        >>> counts = [str(random.randint(0, 10)) for _ in range(20)]
        >>> colors = random.choices(['red', 'green', 'blue'], k=20)
        >>> start = datetime(1917, 8, 23, 11, 3, 29, 9209)
        >>> times = [str(start + timedelta(seconds=10*i)) for i in range(20)]
        >>> areas = [str(random.uniform(0, 10)) for _ in range(20)]
        >>> x = [','.join(row) for row in zip(
        ...    groups, counts, colors, times, areas)]
        >>> data_string = '\r\n'.join(x)
        >>> # write the metadata, header and data strings
        >>> fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
        >>> _ = fp.write(metadata_string)
        >>> _ = fp.write(header_string)
        >>> _ = fp.write(data_string)
        >>> fp.close()
        >>> # open the file for reading
        >>> infile = open(fp.name, mode='r')
        >>> reader = Reader(infile)
        >>> # ask the reader for the header
        >>> reader.header
        ... # doctest: +NORMALIZE_WHITESPACE
        Header(line=6,
        names=['group', 'count', 'color', 'time', 'area'],
        string='group,count,color,time,area')
        >>> # read group, count & area columns where group is a or c & 0 < area <=4
        >>> # by passing keyword args to this reader's 'tab' method
        >>> reader.tab(columns=['group', 'count', 'area'],
        ... group=['a', 'c'],
        ... area='> 0 and <= 4')
        >>> # read the data with a chunksize of 3 rows
        >>> rows = reader.read(chunksize=3)
        >>> type(rows) # rows are of type generator yielding 3 rows at a time
        <class 'generator'>
        >>> for idx, chunk in enumerate(rows):
        ...     print(f'Index = {idx}\n{chunk}')
        ...     # doctest: +NORMALIZE_WHITESPACE
        Index = 0
        [{'group': 'c', 'count': 4, 'area': 3.2005460467254574},
        {'group': 'a', 'count': 10, 'area': 1.0905784593110368},
        {'group': 'c', 'count': 7, 'area': 2.90329502402758}]
        Index = 1
        [{'group': 'c', 'count': 8, 'area': 1.8939132855435614},
        {'group': 'c', 'count': 4, 'area': 1.867295282555551}]
        >>> # close reader since it was not opened with context manager
        >>> reader.close()
        >>> os.remove(fp.name) # explicitly remove the tempfile
    """

    def __init__(
        self,
        infile: IO[str],
        poll: int = 20,
        exclude: Optional[List[str]] = None,
        **sniffing_kwargs,
    ) -> None:
        """Initialize this Reader.

        Args:
            infile:
                An IO stream instance returned by open builtin.
            poll:
                The number of last sample rows to use for the Sniffer to detect
                header, metadata and data types.
            exclude:
               A sequence of characters indicating missing values in the file.
               Rows containing these values will be disqualified from use for
               header, metadata and data type detection. However, this Reader's
               read method will still read and return rows containing these
               excluded values. If None, defaults to
               ['', ' ', '-', 'nan', 'NaN', 'NAN'].
            sniffing_kwargs:
                Any valid kwarg for a tabbed Sniffer instance including: start,
                amount, skips and delimiters. Please see Sniffer initializer.

        Notes:
            During initialization, this reader will use the poll and exclude
            arguments to make an initial guess of the header. If this guess is
            wrong, the header may be explicitly set via the 'header' setter
            property.
        """

        self.infile = infile
        self._sniffer = Sniffer(infile, **sniffing_kwargs)
        self.poll = poll
        # None sentinel avoids a mutable default shared across instances;
        # each Reader gets its own exclusion list
        if exclude is None:
            exclude = ['', ' ', '-', 'nan', 'NaN', 'NAN']
        self.exclude = exclude
        self._header = self._sniffer.header(self.poll, self.exclude)
        self.tabulator = Tabulator(self.header, columns=None, tabs=None)
        self.errors = SimpleNamespace(casting=[], ragged=[])

    @property
    def sniffer(self) -> Sniffer:
        """Returns this Reader's sniffer instance.

        Any time the sniffer is accessed we reset this reader's header and
        tabulator if the header is built by the sniffer.

        Side effect: when the current header was sniffed (it has a line
        number), accessing this property re-sniffs the header and replaces
        the tabulator, discarding any previously set tabs.
        """

        if self._header.line is not None:
            # header has a line number, i.e. it was sniffed rather than set
            # explicitly, so re-sniff it in case sniffer settings changed and
            # rebuild the tabulator against the (possibly new) header
            self._header = self._sniffer.header(self.poll, self.exclude)
            self.tabulator = Tabulator(self.header, columns=None, tabs=None)

        return self._sniffer

    @property
    def header(self) -> Header:
        """Fetches this Reader's current header.

        Returns:
            The current Header instance (line number, column names and
            source string).
        """

        return self._header

    @header.setter
    def header(self, value: int | List[str] | Dict) -> None:
        """Sets this Reader's header and resets the metadata and Tabulator.

        Args:
            value:
                An infile line number, list of string names, or dict of keyword
                arguments for sniffer's header method. If value is type int, the
                header will be set to the split string values of the value row
                of infile. If value is type List, the header will be set to the
                string names in value. If value is type dict, the header will be
                resniffed by sniffer's header method using value keyword args.
                Valid keyword arguments are: 'poll', and 'exclude'. Please type
                help(reader.sniffer.header) for more argument details.

        Returns:
            None

        Raises:
            A ValueError is issued if value is int or List type and the length
            of the proposed header names does not match the length of the last
            sample row in the sniffer.
        """

        # get the expected length of the header from the last sample row.
        expected = len(self._sniffer.rows[-1])

        if isinstance(value, int):
            # sniff exactly one row at the requested line to use as the header
            sniff = Sniffer(self.infile, start=value, amount=1)
            if len(sniff.rows[0]) != expected:
                # note trailing space inside the first fragment: adjacent
                # f-strings concatenate with no separator
                msg = (
                    f'Length of row at index = {value} does not match '
                    f'length of last sample row = {expected}'
                )
                raise ValueError(msg)
            result = Header(value, sniff.rows[0], sniff.sample)

        elif isinstance(value, list):
            if len(value) != expected:
                msg = (
                    f'Length of provided header names = {len(value)} does '
                    f'not match length of last sample row = {expected}'
                )
                raise ValueError(msg)
            result = Header(None, value, None)

        elif isinstance(value, dict):
            result = self._sniffer.header(**value)

        else:
            msg = (
                "A header may be set by integer line number, list of "
                "header names or a dict of kwargs for sniffer's header "
                f"method but not type {type(value)}."
            )
            raise ValueError(msg)

        # set header
        self._header = result
        # determine if reader has previously set tabulator and warn
        previous = self.tabulator
        tblr = Tabulator(self.header, tabs=None, columns=None)
        if tblr.columns != previous.columns or tblr.tabs != previous.tabs:
            msg = (
                "Previously set tabs have been reset. Please call 'tab' "
                "method again before reading."
            )
            print(msg)

        self.tabulator = tblr

    def metadata(self) -> MetaData:
        """Returns this Reader's current metadata.

        Metadata is re-sniffed on each call using the current header, poll
        and exclude settings.

        Returns:
            A sniffed metadata instance.
        """

        return self._sniffer.metadata(self.header, self.poll, self.exclude)

    def tab(
        self,
        columns: Optional[List[str] | List[int] | re.Pattern] = None,
        **tabs: (
            CellType
            | Sequence[CellType]
            | re.Pattern
            | Callable[[Dict[str, CellType], str], bool]
        ),
    ) -> None:
        """Set the Tabulator instance that will filter infile's rows & columns.

        A tabulator holds Tab instances; when called on a row it applies each
        tab in sequence and then restricts the surviving row to the requested
        columns. See the tabbed.tabs module for implementation details.

        Args:
            columns:
                Columns of each row to return during reading: a list of string
                names, a list of column indices, or a compiled regular
                expression pattern matched against header names. If None, all
                header columns are read during a read call.
            tabs:
                name = value keyword argument pairs where name is a valid
                header column name and value may be of type string, int,
                float, complex, time, date, datetime, regular expression or
                callable.

                - A string containing rich comparison(s) builds a comparison
                  tab.
                - A plain string, int, float, complex, time, date or datetime
                  builds an equality tab.
                - A sequence builds a membership tab.
                - A compiled re pattern builds a Regex tab. See class docs
                  for example.

        Returns:
            None
        """

        tabulator = tabbing.Tabulator.from_keywords(self.header, columns, **tabs)
        self.tabulator = tabulator

    def _log_ragged(self, line, row, raise_error):
        """Error logs rows whose length is unexpected.

        Python's csv DictReader places surplus cells (more cells than header
        columns) into a list stored under the None key, and fills missing
        cells (fewer cells than header columns) with None values. Either
        condition marks a ragged row, which is logged here (or raised if
        requested).

        Args:
            line:
                The line number of the row being tested.
            row:
                A row dictionary of header names and casted values.
            raise_error:
                A boolean indicating whether encountering a ragged row should
                raise an error and stop the reading of the file.

        Returns:
            The row with the None restkey popped.

        Raises:
            csv.Error: if a ragged row is found and raise_error is True.
        """

        surplus = row.pop(None, None)
        has_missing = None in row.values()

        if surplus is None and not has_missing:
            return row

        msg = f'Unexpected line length on row {line}'
        if raise_error:
            raise csv.Error(msg)
        self.errors.ragged.append(msg)

        return row

    def _prime(
        self,
        start: Optional[int] = None,
        indices: Optional[Sequence] = None,
    ) -> Tuple[Iterator, int]:
        """Prime this Reader for reading by constructing a row iterator.

        Args:
            start:
                An integer line number from the start of the file to begin
                reading data. If None and this reader's header has a line
                number, the line following the header line is the start. If None
                and the header line is None, the line following the metadata
                section is the start. If None and the file has no header or
                metadata, start is 0. If indices are provided, this argument is
                ignored.
            indices:
                An optional Sequence of line numbers to read rows relative to
                the start of the file. If None, all rows from start not in skips
                will be read. If reading a slice of the file, a range instance
                will have improved performance over list or tuple sequence
                types. Non-range sequences are assumed to be in ascending
                order.

        Notes:
            A warning is issued if the start or index start is less than the
            detected start of the datasection.

        Returns:
            A row iterator & row index the iterator starts from.

        Raises:
            TypeError: if indices is neither None nor a Sequence type.
            csv.Error: if the sniffer failed to detect a dialect.
        """

        # locate the start of the datasection
        autostart = 0
        if self.header.line is not None:
            autostart = self.header.line + 1
        else:
            metalines = self._sniffer.metadata(
                None, self.poll, self.exclude
            ).lines
            autostart = metalines[1] + 1 if metalines[1] else metalines[0]

        astart = start if start is not None else autostart
        stop = None
        step = None

        # indices if provided override start, stop and step
        if indices:

            if isinstance(indices, range):
                astart, stop, step = indices.start, indices.stop, indices.step

            elif isinstance(indices, Sequence):
                # +1 because indices[-1] must be included and islice stops
                # are exclusive
                astart, stop = indices[0], indices[-1] + 1

            else:
                msg = f'indices must be a Sequence type not {type(indices)}.'
                raise TypeError(msg)

        # warn if start is < computed autostart; report astart since start
        # may be None or overridden by indices
        if astart < autostart:
            msg = f'start = {astart} is < than detected data start = {autostart}'
            warnings.warn(msg)

        # rewind so line counting below is relative to the top of the file
        self.infile.seek(0)

        # check that we have a valid simple dialect & convert it
        if not self._sniffer.dialect:
            msg = (
                "Sniffer failed to detect dialect. Please set sniffer's"
                "dialect attribute before calling read"
            )
            raise csv.Error(msg)
        assert isinstance(self._sniffer.dialect, SimpleDialect)
        dialect = self._sniffer.dialect.to_csv_dialect()

        # advance infile past metadata/header so DictReader starts on data
        # pylint: disable-next=expression-not-assigned
        [next(self.infile) for _ in range(astart)]
        row_iter = csv.DictReader(
            self.infile,
            self.header.names,
            dialect=dialect,
        )

        # islice stop is relative to astart; 'is not None' guards stop == 0
        stop = stop - astart if stop is not None else None
        return itertools.islice(row_iter, 0, stop, step), astart

    # read method needs provide reasonable options for args
    # pylint: disable-next=too-many-positional-arguments
    def read(
        self,
        start: Optional[int] = None,
        skips: Optional[Sequence[int]] = None,
        indices: Optional[Sequence] = None,
        chunksize: int = int(2e5),
        skip_empty: bool = True,
        raise_ragged: bool = False,
    ) -> Iterator[List[Dict[str, CellType]]]:
        """Iteratively read dictionary rows that satisfy this Reader's tabs.

        Args:
            start:
                A line number from the start of the file to begin reading data
                from. If None and this reader's header has a line number, the
                line following the header is the start. If None and the header
                line number is None, the line following the last line in the
                metadata is the start. If None and there is no header or
                metadata, the start line is 0.
            skips:
                A sequence of line numbers to skip during reading.
            indices:
                A sequence of line numbers to read rows from. If None, all rows
                from start not in skips will be read. If attempting to read
                a slice of a file a range instance may be provided and will have
                improved performance over other sequence types like lists.
            chunksize:
                The number of data lines to read for each yield. Lower values
                consume less memory. The default is 200,000 rows.
            skip_empty:
                A boolean indicating if rows with no values between the
                delimiters should be skipped. Default is True.
            raise_ragged:
                Boolean indicating if a row with more or fewer columns than
                expected should raise an error and stop reading. The default is
                False. Rows with fewer columns than the header will have None
                as the fill value. Rows with more columns than the header will
                have None keys.

        Yields:
            Chunksize number of dictionary rows with header names as keys.

        Raises:
            csv.Error:
                A csv.Error is issued if a ragged row is encountered and
                raise_ragged is True. Casting problems do not raise errors but
                gracefully return strings when encountered.
        """

        # membership is tested once per row; a set makes each test O(1)
        skipset = set(skips) if skips else set()

        # poll types & formats, inconsistencies will trigger casting error log
        types, _ = self._sniffer.types(self.poll, self.exclude)
        formats, _ = self._sniffer.datetime_formats(self.poll, self.exclude)
        castings = dict(zip(self.header.names, zip(types, formats)))

        # initialize casting and ragged row errors
        self.errors.casting = []
        self.errors.ragged = []

        # construct a row iterator
        row_iter, row_start = self._prime(start, indices)

        # range membership is already O(1); copy other sequences into a set
        included = None
        if indices:
            included = indices if isinstance(indices, range) else set(indices)

        fifo: Deque[Dict[str, CellType]] = deque()
        for line, dic in enumerate(row_iter, row_start):

            if line in skipset:
                continue

            if included is not None and line not in included:
                continue

            if not any(dic.values()) and skip_empty:
                continue

            # chk & log raggedness
            dic = self._log_ragged(line, dic, raise_ragged)

            # perform casts, log errors & filter with tabulator
            arow = {}
            for name, astr in dic.items():

                casting, fmt = castings[name]
                try:
                    arow[name] = parsing.convert(astr, casting, fmt)
                except (ValueError, OverflowError, TypeError):
                    # on exception leave astr unconverted & log casting error
                    msg = f"line = {line}, column = '{name}'"
                    self.errors.casting.append(msg)
                    arow[name] = astr

            # apply tabs to filter row
            row = self.tabulator(arow)

            if row:
                fifo.append(row)

            if len(fifo) >= chunksize:
                yield [fifo.popleft() for _ in range(chunksize)]

        yield list(fifo)
        self.infile.seek(0)

    def peek(self, count: int = 10) -> None:
        """Prints count number of lines from the first line of the file.

        Useful for checking that this Reader identifies the correct metadata,
        header and data start locations.

        Args:
            count:
                The number of lines to print.

        Returns:
            None
        """

        # ANSI escapes render each line number in red on terminals
        red, endc = '\033[91m', '\033[0m'
        for num in range(count):
            print(red + f'{num}' + endc, next(self.infile).rstrip())

        self.infile.seek(0)

    def close(self):
        """Closes this Reader's infile resource.

        Call this when the Reader's infile was not opened within a context
        manager; no further reads are possible afterwards.
        """

        self.infile.close()

sniffer property

Returns this Reader's sniffer instance.

Any time the sniffer is accessed we reset this reader's header and tabulator if the header is built by the sniffer.

header property writable

Fetches this Reader's current header.

__init__(infile, poll=20, exclude=['', ' ', '-', 'nan', 'NaN', 'NAN'], **sniffing_kwargs)

Initialize this Reader.

Parameters:

Name Type Description Default
infile IO[str]

An IO stream instance returned by open builtin.

required
poll int

The number of last sample rows to use for the Sniffer to detect header, metadata and data types.

20
exclude List[str]

A sequence of characters indicating missing values in the file. Rows containing these values will be disqualified from use for header, metadata and data type detection. However, this Reader's read method will still read and return rows containing these excluded values.

['', ' ', '-', 'nan', 'NaN', 'NAN']
sniffing_kwargs

Any valid kwarg for a tabbed Sniffer instance including: start, amount, skips and delimiters. Please see Sniffer initializer.

{}
Notes

During initialization, this reader will use the poll and exclude arguments to make an initial guess of the header. If this guess is wrong, the header may be explicitly set via the 'header' setter property.

Source code in src/tabbed/reading.py
def __init__(
    self,
    infile: IO[str],
    poll: int = 20,
    exclude: List[str] = ['', ' ', '-', 'nan', 'NaN', 'NAN'],
    **sniffing_kwargs,
) -> None:
    """Initialize this Reader.

    Args:
        infile:
            An IO stream instance returned by open builtin.
        poll:
            The number of last sample rows to use for the Sniffer to detect
            header, metadata and data types.
        exclude:
           A sequence of characters indicating missing values in the file.
           Rows containing these values will be disqualified from use for
           header, metadata and data type detection. However, this Reader's
           read method will still read and return rows with this exclusion
           values.
        sniffing_kwargs:
            Any valid kwarg for a tabbed Sniffer instance including: start,
            amount, skips and delimiters. Please see Sniffer initializer.

    Notes:
        During initialization, this reader will use the poll and exclude
        arguments to make an initial guess of the header. If this guess is
        wrong, the header may be explicitly set via the 'header' setter
        property.
    """

    self.infile = infile
    self._sniffer = Sniffer(infile, **sniffing_kwargs)
    self.poll = poll
    self.exclude = exclude
    self._header = self._sniffer.header(self.poll, self.exclude)
    self.tabulator = Tabulator(self.header, columns=None, tabs=None)
    self.errors = SimpleNamespace(casting=[], ragged=[])

metadata()

Returns this Reader's current metadata.

Returns:

Type Description
MetaData

A sniffed metadata instance.

Source code in src/tabbed/reading.py
def metadata(self) -> MetaData:
    """Returns this Reader's current metadata.

    Returns:
        A sniffed metadata instance.
    """

    return self._sniffer.metadata(self.header, self.poll, self.exclude)

tab(columns=None, **tabs)

Set the Tabulator instance that will filter infile's rows & columns.

A tabulator is a container of tab instances that when called on a row, sequentially applies each tab to that row. Additionally after applying the row tabs it filters the result by columns. Implementation details may be found in the tabbed.tabs module.

Parameters:

Name Type Description Default
columns Optional[List[str] | List[int] | Pattern]

Columns in each row to return during reading as a list of string names, a list of column indices or a compiled regular expression pattern to match against header names. If None, all the columns in the header will be read during a read call.

None
tabs CellType | Sequence[CellType] | Pattern | Callable[[Dict[str, CellType], str], bool]

name = value keyword argument pairs where name is a valid header column name and value may be of type string, int, float, complex, time, date, datetime, regular expression or callable.

  • If a string type with rich comparison(s) is provided, a comparison tab is constructed.
  • If a string, int, float, complex, time, date or datetime is provided, an equality tab is constructed.
  • If a sequence is provided, a membership tab is constructed.
  • If a compiled re pattern, a Regex tab is constructed. See class docs for example.
{}

Returns:

Type Description
None

None

Source code in src/tabbed/reading.py
def tab(
    self,
    columns: Optional[List[str] | List[int] | re.Pattern] = None,
    **tabs: (
        CellType
        | Sequence[CellType]
        | re.Pattern
        | Callable[[Dict[str, CellType], str], bool]
    ),
) -> None:
    """Set the Tabulator instance that will filter infile's rows & columns.

    A tabulator holds tab instances; when called on a row it applies each
    tab in turn and then restricts the result to the requested columns.
    See the tabbed.tabs module for the implementation details.

    Args:
        columns:
            The columns to return from each row during reading: a list of
            header names, a list of column indices, or a compiled regular
            expression matched against the header names. When None, every
            header column is returned by a read call.
        tabs:
            name = value keyword pairs where name is a valid header column
            name and value is a string, int, float, complex, time, date,
            datetime, regular expression or callable.

            - A string containing rich comparison(s) builds a comparison
              tab.
            - A plain string, int, float, complex, time, date or datetime
              builds an equality tab.
            - A sequence builds a membership tab.
            - A compiled re pattern builds a Regex tab. See the class docs
              for an example.

    Returns:
        None
    """

    # build the tabulator from keywords, then install it on this Reader
    tabulator = tabbing.Tabulator.from_keywords(self.header, columns, **tabs)
    self.tabulator = tabulator

read(start=None, skips=None, indices=None, chunksize=200000, skip_empty=True, raise_ragged=False)

Iteratively read dictionary rows that satisfy this Reader's tabs.

Parameters:

Name Type Description Default
start Optional[int]

A line number from the start of the file to begin reading data from. If None and this reader's header has a line number, the line following the header is the start. If None and the header line number is None, the line following the last line in the metadata is the start. If None and there is no header or metadata, the start line is 0.

None
skips Optional[Sequence[int]]

A sequence of line numbers to skip during reading.

None
indices Optional[Sequence]

A sequence of line numbers to read rows from. If None, all rows from start not in skips will be read. If attempting to read a slice of a file a range instance may be provided and will have improved performance over other sequence types like lists.

None
chunksize int

The number of data lines to read for each yield. Lower values consume less memory. The default is 200,000 rows.

200000
skip_empty bool

A boolean indicating if rows with no values between the delimiters should be skipped. Default is True.

True
raise_ragged bool

Boolean indicating if a row with more or fewer columns than expected should raise an error and stop reading. The default is False. Rows with fewer columns than the header will have None as the fill value. Rows with more columns than the header will have None keys.

False

Yields:

Type Description
List[Dict[str, CellType]]

Chunksize number of dictionary rows with header names as keys.

Raises:

Type Description
Error

A csv.Error is issued if a ragged row is encountered and raise_ragged is True. Casting problems do not raise errors but gracefully return strings when encountered.

Source code in src/tabbed/reading.py
def read(
    self,
    start: Optional[int] = None,
    skips: Optional[Sequence[int]] = None,
    indices: Optional[Sequence] = None,
    chunksize: int = int(2e5),
    skip_empty: bool = True,
    raise_ragged: bool = False,
) -> Iterator[List[Dict[str, CellType]]]:
    """Iteratively read dictionary rows that satisfy this Reader's tabs.

    Args:
        start:
            A line number from the start of the file to begin reading data
            from. If None and this reader's header has a line number, the
            line following the header is the start. If None and the header
            line number is None, the line following the last line in the
            metadata is the start. If None and there is no header or
            metadata, the start line is 0.
        skips:
            A sequence of line numbers to skip during reading.
        indices:
            A sequence of line numbers to read rows from. If None, all rows
            from start not in skips will be read. If attempting to read
            a slice of a file a range instance may be provided and will have
            improved performance over other sequence types like lists.
        chunksize:
            The number of data lines to read for each yield. Lower values
            consume less memory. The default is 200,000 rows.
        skip_empty:
            A boolean indicating if rows with no values between the
            delimiters should be skipped. Default is True.
        raise_ragged:
            Boolean indicating if a row with more or fewer columns than
            expected should raise an error and stop reading. The default is
            False. Rows with fewer columns than the header will have None
            as  the fill value. Rows with more columns than the header will
            have None keys.

    Yields:
        Chunksize number of dictionary rows with header names as keys.

    Raises:
        csv.Error:
            A csv.Error is issued if a ragged row is encountered and
            raise_ragged is True. Casting problems do not raise errors but
            gracefully return strings when encountered.
    """

    # a frozenset gives O(1) membership per data line; the original list
    # cost O(len(skips)) on every iteration. Semantics are unchanged.
    skipset = frozenset(skips) if skips else frozenset()

    # poll types & formats, inconsistencies will trigger casting error log
    types, _ = self._sniffer.types(self.poll, self.exclude)
    formats, _ = self._sniffer.datetime_formats(self.poll, self.exclude)
    castings = dict(zip(self.header.names, zip(types, formats)))

    # initialize casting and ragged row errors
    self.errors.casting = []
    self.errors.ragged = []

    # construct a row iterator positioned at the data start line
    row_iter, row_start = self._prime(start, indices)

    # fifo buffers filtered rows until a full chunk is available
    fifo: Deque[Dict[str, CellType]] = deque()
    for line, dic in enumerate(row_iter, row_start):

        if line in skipset:
            continue

        # indices left as given: a range instance keeps O(1) membership
        # without being materialized into memory
        if indices and line not in indices:
            continue

        if not any(dic.values()) and skip_empty:
            continue

        # chk & log raggedness
        dic = self._log_ragged(line, dic, raise_ragged)

        # perform casts, log errors & filter with tabulator
        arow = {}
        for name, astr in dic.items():

            casting, fmt = castings[name]
            try:
                arow[name] = parsing.convert(astr, casting, fmt)
            except (ValueError, OverflowError, TypeError):
                # on exception leave astr unconverted & log casting error
                msg = f"line = {line}, column = '{name}'"
                self.errors.casting.append(msg)
                arow[name] = astr

        # apply tabs to filter row; a falsy result means the row is dropped
        row = self.tabulator(arow)

        if row:
            fifo.append(row)

        if len(fifo) >= chunksize:
            yield [fifo.popleft() for _ in range(chunksize)]

    # yield the final (possibly empty) partial chunk, then rewind the
    # stream so this Reader can be read again
    yield list(fifo)
    self.infile.seek(0)

peek(count=10)

Prints count number of lines from the first line of the file.

This method can be used to ensure this Reader identifies the correct metadata, header and data start locations.

Parameters:

Name Type Description Default
count int

The number of lines to print.

10

Returns:

Type Description
None

None

Source code in src/tabbed/reading.py
def peek(self, count: int = 10) -> None:
    """Prints up to count lines from the first line of the file.

    This method can be used to ensure this Reader identifies the correct
    metadata, header and data start locations.

    Args:
        count:
            The maximum number of lines to print. Printing stops early at
            end of file.

    Returns:
        None
    """

    # ANSI escape codes color the line numbers red; hoisted out of the loop
    red = '\033[91m'
    reset = '\033[0m'
    for num in range(count):
        # next with a default avoids an uncaught StopIteration (and a
        # skipped rewind) when the file has fewer than count lines
        line = next(self.infile, None)
        if line is None:
            break
        print(red + f'{num}' + reset, line.rstrip())

    self.infile.seek(0)

close()

Closes this Reader's infile resource.

Source code in src/tabbed/reading.py
def close(self):
    """Close the I/O stream this Reader reads from."""

    # release the open file handle held by this Reader
    stream = self.infile
    stream.close()