# Source code for ww.tools.strings

# coding: utf-8

"""
    :doc:`s() </string_wrapper>` is very convenient, but it's only a
    thin wrapper on top of regular strings and the tools from this module.

    So if you want to apply some of the goodies from it without having to
    turn your strings into StringWrapper objects, you can use the functions
    from this module directly.

    They don't accept bytes as an input. If you do so and it works, you must
    know it's not a supported behavior and may change in the future. Only
    pass:

    - unicode objects in Python 2;
    - str objects in Python 3.

    Example:

        >>> from ww.tools.strings import multisplit  # same as s().split()
        >>> string = u'a,b;c/d=a,b;c/d'
        >>> chunks = multisplit(string, u',', u';', u'[/=]', maxsplit=4)
        >>> for chunk in chunks: print(chunk)
        a
        b
        c
        d
        a,b;c/d

    You'll find below the detailed documentation for each function of
    this module.
    Go have a look, there is some great stuff here!
"""

from __future__ import absolute_import, division, print_function

import re

from past.builtins import basestring

import ww
from ww.utils import require_positive_number, ensure_tuple
from ww.types import unicode, str_istr, str_istr_icallable, C, I  # noqa

# One-letter shortcuts accepted wherever this module takes regex flags as a
# string. 'x' and 'v' are both aliases for VERBOSE; 's' and '.' are both
# aliases for DOTALL.
REGEX_FLAGS = dict((
    ('m', re.MULTILINE),
    ('x', re.VERBOSE),
    ('v', re.VERBOSE),
    ('s', re.DOTALL),
    ('.', re.DOTALL),
    ('d', re.DEBUG),
    ('i', re.IGNORECASE),
    ('u', re.UNICODE),
    ('l', re.LOCALE),
))

# Python 2 doesn't support the re.ASCII flag, so its shortcut is only
# registered when the attribute actually exists.
if hasattr(re, 'ASCII'):
    REGEX_FLAGS['a'] = re.ASCII


def parse_re_flags(flags):
    """ Turn a string of one-letter flags into a combined ``re`` flag value

        Args:
            flags: either a string where each character is a key of
                   REGEX_FLAGS, or anything else (typically an already
                   computed ``re`` flag value).

        Returns:
            The bitwise OR of the flags matching each character if a string
            was passed (0 for an empty string). Any non-string value is
            returned untouched.

        Raises:
            KeyError: if a character of the string is not a key of
                      REGEX_FLAGS.
    """
    # Anything that is not a string is handed back as is.
    if not isinstance(flags, basestring):
        return flags

    combined = 0
    for letter in flags:
        combined |= REGEX_FLAGS[letter]
    return combined


# kwargs allow compatibility with 2.7 and 3 since you can't use
# keyword-only arguments in python 2
# TODO: remove empty strings
def multisplit(string,  # type: unicode
               *separators,  # type: unicode
               **kwargs  # type: Union[unicode, C[..., I[unicode]]]
               ):
    # type: (...) -> I
    """ Like unicode.split, but accept several separators and regexes

        Args:
            string: the string to split.
            separators: strings you can split on. Each string can be a
                        regex.
            maxsplit: max number of time you wish to split. default is 0,
                      which means no limit.
            flags: flags you wish to pass if you use regexes. You should
                   pass them as a string containing a combination of:

                    - 'm' for re.MULTILINE
                    - 'x' for re.VERBOSE
                    - 'v' for re.VERBOSE
                    - 's' for re.DOTALL
                    - '.' for re.DOTALL
                    - 'd' for re.DEBUG
                    - 'i' for re.IGNORECASE
                    - 'u' for re.UNICODE
                    - 'l' for re.LOCALE
                    - 'a' for re.ASCII (only where ``re`` provides it)

            cast: what to cast the result to. Defaults to list. It is
                  applied whether or not separators are passed.

        Returns:
            An iterable of substrings: the result of calling ``cast`` on
            the chunks.

        Raises:
            ValueError: if you pass a flag without separators.
            TypeError: if you pass something else than unicode strings.
            KeyError: if the flags string contains an unknown letter.

        Example:

            >>> for word in multisplit(u'fat     black cat, big'): print(word)
            fat
            black
            cat,
            big
            >>> string = u'a,b;c/d=a,b;c/d'
            >>> chunks = multisplit(string, u',', u';', u'[/=]', maxsplit=4)
            >>> for chunk in chunks: print(chunk)
            a
            b
            c
            d
            a,b;c/d

    """

    cast = kwargs.pop('cast', list)
    flags = parse_re_flags(kwargs.get('flags', 0))
    # 0 means "no limit" for re.split
    maxsplit = require_positive_number(kwargs.get('maxsplit', 0),
                                       'maxsplit')

    # no separator means we use the default unicode.split behavior
    if not separators:
        if flags:
            raise ValueError(ww.s >> """
                             You can't pass flags without passing
                             a separator. Flags only have sense if
                             you split using a regex.
                             """)

        maxsplit = maxsplit or -1  # -1 means "no limit" for unicode.split
        # Honor ``cast`` here too, so every code path returns the same kind
        # of object.
        return cast(unicode.split(string, None, maxsplit))

    # Check that all separators are strings
    for i, sep in enumerate(separators):
        if not isinstance(sep, unicode):
            raise TypeError(ww.s >> """
                '{!r}', the separator at index '{}', is of type '{}'.
                multisplit() only accepts unicode strings.
            """.format(sep, i, type(sep)))

    # TODO: split let many empty strings in the result. Fix it.

    seps = list(separators)  # cast to list so we can slice it

    # simple code for when you need to split the whole string
    if maxsplit == 0:
        return cast(_split(string, seps, flags))

    # slow implementation with checks for recursive maxsplit
    return cast(_split_with_max(string, seps, maxsplit, flags))
def _split(string, separators, flags=0):
    """ Yield the chunks of ``string`` once split on every separator

        Args:
            string: the string to split.
            separators: a sequence of regex patterns. The last one is
                        applied first, then the others are applied to each
                        resulting chunk. The sequence is not modified.
            flags: ``re`` flags passed to every re.split() call.

        Yields:
            The chunks, from left to right.
    """
    if not separators:
        yield string
        return

    # Slice instead of pop(): the sequence is shared by every sibling
    # generator of the recursion, so consuming it in place would leave
    # nothing to split on for all chunks after the first one.
    sep, remaining = separators[-1], separators[:-1]

    # recursive split until we got the smallest chunks
    for chunk in re.split(sep, string, flags=flags):
        for item in _split(chunk, remaining, flags=flags):
            yield item


def _split_with_max(string, separators, maxsplit, flags=0):
    """ Like _split(), but stop after ``maxsplit`` splits

        The chunks are the ones _split() would produce, taken from left to
        right. Once ``maxsplit`` splits have been made, whatever is left of
        the string is yielded as a single last chunk.

        Args:
            string: the string to split.
            separators: a sequence of regex patterns. The last one is
                        applied first. The sequence is not modified.
            maxsplit: the maximum number of splits. 0 or less yields the
                      string untouched.
            flags: ``re`` flags passed to every re.split() call.

        Yields:
            At most ``maxsplit + 1`` chunks, from left to right.
    """
    if maxsplit <= 0 or not separators:
        yield string
        return

    # Slice instead of pop() so sibling generators keep their separators.
    sep, remaining = separators[-1], separators[:-1]

    while maxsplit > 0:
        # we split only in 2, then recursively head first to get the rest
        res = re.split(sep, string, maxsplit=1, flags=flags)
        if len(res) < 2:
            # This separator is not in the string anymore.
            break

        # NOTE: a pattern with capturing groups makes re.split() return
        # more than two items, so this unpacking raises ValueError.
        head, tail = res

        # The head is bounded, so it is cheap to materialize: at most
        # maxsplit + 1 pieces.
        pieces = list(_split_with_max(head, remaining, maxsplit,
                                      flags=flags))
        used = len(pieces) - 1

        if used >= maxsplit:
            # The whole budget was spent inside the head, so the cut
            # between head and tail must not happen. The last piece is a
            # suffix of the head: extend it up to the end of the string.
            for piece in pieces[:-1]:
                yield piece
            yield string[len(head) - len(pieces[-1]):]
            return

        for piece in pieces:
            yield piece

        # Splits made inside the head, plus the head/tail cut itself.
        maxsplit -= used + 1
        string = tail

    # Either the budget is spent (the call below yields the string as is)
    # or this separator is exhausted and the others get what is left.
    for piece in _split_with_max(string, remaining, maxsplit, flags=flags):
        yield piece
def multireplace(string,  # type: unicode
                 patterns,  # type: str_istr
                 substitutions,  # type: str_istr_icallable
                 maxreplace=0,  # type: int
                 flags=0  # type: unicode
                 ):
    # type: (...) -> unicode
    r""" Like unicode.replace() but accept several substitutions and regexes

        Args:
            string: the string to make the replacements in.
            patterns: a string, or an iterable of strings to be replaced.
                      Each string can be a regex.
            substitutions: a string or an iterable of strings to use as a
                           replacement. You can pass either one string, or
                           an iterable containing the same number of
                           substitutions that you passed as patterns. You
                           can also pass a callable instead of a string. It
                           should expect a match object as a parameter.
            maxreplace: the max number of replacements to make, counted
                        across all patterns. 0 is no limit, which is the
                        default.
            flags: flags you wish to pass if you use regexes. You should
                   pass them as a string containing a combination of:

                    - 'm' for re.MULTILINE
                    - 'x' for re.VERBOSE
                    - 'v' for re.VERBOSE
                    - 's' for re.DOTALL
                    - '.' for re.DOTALL
                    - 'd' for re.DEBUG
                    - 'i' for re.IGNORECASE
                    - 'u' for re.UNICODE
                    - 'l' for re.LOCALE

        Returns:
            The string with replaced bits.

        Raises:
            ValueError: if you pass the wrong number of substitutions.

        Example:

            >>> print(multireplace(u'a,b;c/d', (u',', u';', u'/'), u','))
            a,b,c,d
            >>> print(multireplace(u'a1b33c-d', u'\\d+', u','))
            a,b,c-d
            >>> print(multireplace(u'a-1,b-3,3c-d', u',|-', u'', maxreplace=3))
            a1b3,3c-d
            >>> def upper(match):
            ...     return match.group().upper()
            ...
            >>> print(multireplace(u'a-1,b-3,3c-d', u'[ab]', upper))
            A-1,B-3,3c-d
    """

    # we can pass either a string or an iterable of strings
    patterns = ensure_tuple(patterns)
    substitutions = ensure_tuple(substitutions)

    # you can either have:
    # - many patterns, one substitution
    # - many patterns, exactly as many substitutions
    # anything else is an error
    num_of_subs = len(substitutions)
    num_of_patterns = len(patterns)

    if num_of_subs == 1 and num_of_patterns > 0:
        # Reuse the single substitution for every pattern.
        substitutions *= num_of_patterns
    elif num_of_patterns != num_of_subs:
        raise ValueError("You must have exactly one substitution "
                         "for each pattern or only one substitution")

    flags = parse_re_flags(flags)

    # no limit for replacing, use a simple code
    if not maxreplace:
        for pattern, sub in zip(patterns, substitutions):
            string = re.sub(pattern, sub, string, flags=flags)
        return string

    # ensure we respect the max number of replace across substitutions
    for pattern, sub in zip(patterns, substitutions):
        string, count = re.subn(pattern, sub, string,
                                count=maxreplace, flags=flags)
        maxreplace -= count
        if maxreplace == 0:
            break

    return string