HEX

File: //lib/google-cloud-sdk/lib/googlecloudsdk/command_lib/storage/wildcard_iterator.py
# -*- coding: utf-8 -*- #
# Copyright 2020 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for expanding wildcarded GCS pathnames."""

from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import abc
import collections
import fnmatch
import glob
import os
import re

from googlecloudsdk.api_lib.storage import api_factory
from googlecloudsdk.api_lib.storage import cloud_api
from googlecloudsdk.api_lib.storage import errors as api_errors
from googlecloudsdk.command_lib.storage import errors
from googlecloudsdk.command_lib.storage import resource_reference
from googlecloudsdk.command_lib.storage import storage_url
import six


WILDCARD_REGEX = re.compile(r'[*?\[\]]')


def contains_wildcard(url_string):
  """Checks whether url_string contains a wildcard.

  Args:
    url_string: URL string to check.

  Returns:
    bool indicator.
  """
  return bool(WILDCARD_REGEX.search(url_string))


def get_wildcard_iterator(
    url_str, all_versions=False, fields_scope=cloud_api.FieldsScope.NO_ACL):
  """Instantiate a WildcardIterator for the given URL string.

  Args:
    url_str (str): URL string which may contain wildcard characters.
    all_versions (bool): If true, the iterator yields all versions of objects
        matching the wildcard.  If false, yields just the live object version.
    fields_scope (cloud_api.FieldsScope): Determines amount of metadata
        returned by API.
  Returns:
    A WildcardIterator object.
  """
  url = storage_url.storage_url_from_string(url_str)
  if isinstance(url, storage_url.CloudUrl):
    return CloudWildcardIterator(url, all_versions, fields_scope)
  elif isinstance(url, storage_url.FileUrl):
    return FileWildcardIterator(url)
  else:
    raise errors.InvalidUrlError('Unknown url type %s.' % url)


class WildcardIterator(six.with_metaclass(abc.ABCMeta)):
  """Class for iterating over Google Cloud Storage strings containing wildcards.

  The base class is abstract; you should instantiate using the
  wildcard_iterator() static factory method, which chooses the right
  implementation depending on the base string.
  """

  def __repr__(self):
    """Returns string representation of WildcardIterator."""
    return 'WildcardIterator(%s)' % self.wildcard_url.url_string


class FileWildcardIterator(WildcardIterator):
  """Class to iterate over files and directories."""

  def __init__(self, url):
    """Initialize FileWildcardIterator instance.

    Args:
      url (FileUrl): A FileUrl instance representing a file path.
    """
    super().__init__()
    self._path = url.object_name

  def __iter__(self):
    recursion_needed = '**' in self._path
    for path in glob.iglob(self._path, recursive=recursion_needed):
      # For pattern like foo/bar/**, glob returns first path as 'foo/bar/'
      # even when foo/bar does not exist. So we skip non-existing paths.
      # Glob also returns intermediate directories if called with **. We skip
      # them to be consistent with CloudWildcardIterator.
      if self._path.endswith('**') and (not os.path.exists(path)
                                        or os.path.isdir(path)):
        continue

      file_url = storage_url.FileUrl(path)
      if os.path.isdir(path):
        yield resource_reference.FileDirectoryResource(file_url)
      else:
        yield resource_reference.FileObjectResource(file_url)


class CloudWildcardIterator(WildcardIterator):
  """Class to iterate over Cloud Storage strings containing wildcards."""

  def __init__(
      self, url, all_versions=False, fields_scope=cloud_api.FieldsScope.NO_ACL):
    """Instantiates an iterator that matches the wildcard URL.

    Args:
      url (CloudUrl): CloudUrl that may contain wildcard that needs expansion.
      all_versions (bool): If true, the iterator yields all versions of objects
          matching the wildcard.  If false, yields just the live object version.
      fields_scope (cloud_api.FieldsScope): Determines amount of metadata
          returned by API.
    """
    super(CloudWildcardIterator, self).__init__()
    self._url = url
    self._all_versions = all_versions
    self._fields_scope = fields_scope
    self._client = api_factory.get_api(url.scheme)

    if url.url_string.endswith(url.delimiter):
      # Forces the API to return prefixes instead of their contents.
      url = storage_url.storage_url_from_string(
          storage_url.rstrip_one_delimiter(url.url_string))

  def __iter__(self):
    if self._url.is_provider():
      for bucket_resource in self._client.ListBuckets(self._fields_scope):
        yield bucket_resource
    else:
      for bucket_resource in self._fetch_buckets():
        if self._url.is_bucket():
          yield bucket_resource
        else:  # URL is an object or prefix.
          for obj_resource in self._fetch_objects(
              bucket_resource.storage_url.bucket_name):
            yield obj_resource

  def _fetch_objects(self, bucket_name):
    """Fetch all objects for the given bucket that match the URL."""
    if not contains_wildcard(self._url.object_name) and not self._all_versions:
      try:
        # Assume that the url represents a single object.
        return [self._client.GetObjectMetadata(
            bucket_name, self._url.object_name, self._url.generation,
            self._fields_scope)]
      except api_errors.NotFoundError:
        # Object does not exist. Could be a prefix.
        pass
    return self._expand_object_path(bucket_name)

  def _expand_object_path(self, bucket_name):
    """If wildcard, expand object names.

    Recursively expand each folder with wildcard.

    Args:
      bucket_name (str): Name of the bucket.

    Yields:
      resource_reference.Resource objects where each resource can be
      an ObjectResource object or a PrefixResource object.
    """
    # Retain original name to see if user wants only prefixes.
    original_object_name = self._url.object_name
    # Force API to return prefix resource not the prefix's contents.
    object_name = storage_url.rstrip_one_delimiter(original_object_name)

    names_needing_expansion = collections.deque([object_name])
    while names_needing_expansion:
      name = names_needing_expansion.popleft()

      # Parse out the prefix, delimiter, filter_pattern and suffix.
      # Given a string 'a/b*c/d/e*f/g.txt', this will return
      # CloudWildcardParts(prefix='a/b', filter_pattern='*c',
      #                    delimiter='/', suffix='d/e*f/g.txt')
      wildcard_parts = CloudWildcardParts.from_string(name, self._url.delimiter)

      # Fetch all the objects and prefixes.
      resource_iterator = self._client.ListObjects(
          all_versions=self._all_versions or bool(self._url.generation),
          bucket_name=bucket_name,
          delimiter=wildcard_parts.delimiter,
          fields_scope=self._fields_scope,
          prefix=wildcard_parts.prefix or None)

      # We have all the objects and prefixes that matched the
      # wildcard_parts.prefix. Use the filter_pattern to eliminate non-matching
      # objects and prefixes.
      filtered_resources = self._filter_resources(
          resource_iterator,
          wildcard_parts.prefix + wildcard_parts.filter_pattern)

      for resource in filtered_resources:
        if wildcard_parts.suffix:
          if isinstance(resource, resource_reference.PrefixResource):
            # Suffix is present, which indicates that we have more wildcards to
            # expand. Let's say object_name is a/b1c. Then the new string that
            # we want to expand will be a/b1c/d/e*f/g.txt
            names_needing_expansion.append(
                resource.storage_url.object_name  + wildcard_parts.suffix)
        else:
          # Make sure an object is not returned if the original query was for
          # a prefix.
          if (not resource.storage_url.object_name.endswith(self._url.delimiter)
              and original_object_name.endswith(self._url.delimiter)):
            continue
          yield resource

  def _filter_resources(self, resource_iterator, wildcard_pattern):
    """Filter out resources that do not match the wildcard_pattern.

    Args:
      resource_iterator (iterable): An iterable resource_reference.Resource
        objects.
      wildcard_pattern (str): The wildcard_pattern to filter the resources.

    Yields:
      resource_reference.Resource objects matching the wildcard_pattern
    """
    regex_string = fnmatch.translate(wildcard_pattern)
    regex_pattern = re.compile(regex_string)
    for resource in resource_iterator:
      # A prefix resource returned by the API will always end with a slash.
      # We strip the slash in the end to match cases like gs://bucket/folder1
      object_name = storage_url.rstrip_one_delimiter(
          resource.storage_url.object_name)
      if (self._url.generation and
          resource.storage_url.generation != self._url.generation):
        # Filter based on generation, if generation is present in the request.
        continue
      if regex_pattern.match(object_name):
        yield resource

  def _fetch_buckets(self):
    """Fetch the bucket(s) corresponding to the url.

    Returns:
      An iterable of BucketResource objects.
    """
    if contains_wildcard(self._url.bucket_name):
      return self._expand_bucket_wildcards(self._url.bucket_name)
    elif self._url.is_bucket():
      return  [self._client.GetBucket(
          self._url.bucket_name, self._fields_scope)]
    else:
      return [resource_reference.BucketResource(self._url)]

  def _expand_bucket_wildcards(self, bucket_name):
    """Expand bucket names with wildcard.

    Args:
      bucket_name (str): Bucket name with wildcard.

    Yields:
      BucketResource objects.
    """
    regex = fnmatch.translate(bucket_name)
    bucket_pattern = re.compile(regex)
    for bucket_resource in self._client.ListBuckets(self._fields_scope):
      if bucket_pattern.match(bucket_resource.name):
        yield bucket_resource


class CloudWildcardParts:
  """Different parts of the wildcard string used for querying and filtering."""

  def __init__(self, prefix, filter_pattern, delimiter, suffix):
    """Initialize the CloudWildcardParts object.

    Args:
      prefix (str): The prefix string to be passed to the API request.
        This is the substring before the first occurrance of the wildcard.
      filter_pattern (str): The pattern to be used to filter out the results
        returned by the ListObjects call. This is a substring starting from
        the first occurance of the wildcard upto the first delimiter.
      delimiter (str): The delimiter to be passed to the api request.
      suffix (str): The substirng after the first delimiter in the wildcard.
    """
    self.prefix = prefix
    self.filter_pattern = filter_pattern
    self.delimiter = delimiter
    self.suffix = suffix

  @classmethod
  def from_string(cls, string, delimiter=storage_url.CloudUrl.CLOUD_URL_DELIM):
    """Create a CloudWildcardParts instance from a string.

    Args:
      string (str): String that needs to be splitted into different parts.
      delimiter (str): The delimiter to be used for splitting the string.

    Returns:
      WildcardParts object.
    """
    # Let's assume name => "a/b/c/d*e/f/g*.txt".
    # prefix => "a/b/c/d", wildcard_string => "*e/f/g*.txt".
    prefix, wildcard_string = _split_on_wildcard(string)
    # We expand one level at a time. Hence, spliting on delimiter.
    # filter_pattern => "*e", suffix = "f/g*.txt".
    filter_pattern, _, suffix = wildcard_string.partition(delimiter)

    if '**' in filter_pattern:
      # Fetch all objects for ** pattern. No delimiter is required since we
      # want to fetch all the objects here.
      delimiter = None
      filter_pattern = wildcard_string
      # Since we have fetched all the objects, suffix is no longer required.
      suffix = None

    return cls(prefix, filter_pattern, delimiter, suffix)


def _split_on_wildcard(string):
  """Split the string into two such that first part does not have any wildcard.

  Args:
    string (str): The string to be split.

  Returns:
    A 2-tuple where first part doesn't have any wildcard, and second part does
    have a wildcard. If wildcard is not found, the second part is empty.
    If string starts with a wildcard then first part is empty.
    For example:
      _split_on_wildcard('a/b/c/d*e/f/*.txt') => ('a/b/c/d', '*e/f/*.txt')
      _split_on_wildcard('*e/f/*.txt') => ('', '*e/f/*.txt')
      _split_on_wildcard('a/b/c/d') => ('a/b/c/d', '')
  """
  match = WILDCARD_REGEX.search(string)
  if match is None:
    return string, ''
  first_wildcard_idx = match.start()
  prefix = string[:first_wildcard_idx]
  wildcard_str = string[first_wildcard_idx:]
  return prefix, wildcard_str