import io
import logging
import os
import re
from datetime import timedelta, datetime
from typing import List, Optional, Union
from speasy.core.cache import CacheCall
from speasy.core.cache import get_item, add_item, CacheItem
from . import http
from .url_utils import is_local_file
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Captures the value of an ` href="...">` attribute when that value is made only of ASCII letters,
# digits, '-', '_' and '.'; links containing any other character (e.g. '/' or '?') are not matched.
_HREF_REGEX = re.compile(' href="([A-Za-z0-9-_.]+)">')
[docs]
class AnyFile(io.IOBase):
    """File-like wrapper that pairs an underlying stream with its URL and a status code.

    ``read`` and ``seek`` are forwarded explicitly to the wrapped stream. Any attribute that is not found on the
    wrapper itself is looked up on the wrapped stream. Closing the wrapper (explicitly, through a ``with`` block or
    when it is garbage collected) also closes the wrapped stream.

    Parameters
    ----------
    url : str
        Location the content was obtained from.
    file_impl : io.IOBase
        Stream that actually holds the data.
    status : int
        Status code associated with the file (default: 200).
    """

    def __init__(self, url, file_impl: io.IOBase, status=200):
        super().__init__()
        self._url = url
        self._file_impl = file_impl
        self._status = status

    @property
    def url(self):
        """URL given at construction time."""
        return self._url

    def read(self, *args, **kwargs):
        """Forward to the wrapped stream's ``read``."""
        return self._file_impl.read(*args, **kwargs)

    def seek(self, *args, **kwargs):
        """Forward to the wrapped stream's ``seek``."""
        return self._file_impl.seek(*args, **kwargs)

    def close(self):
        """Close the wrapper and the wrapped stream.

        ``io.IOBase.close`` only marks the wrapper itself as closed, so the wrapped stream has to be closed
        explicitly here; otherwise ``close()`` and ``with`` blocks would leave it open.
        """
        # Read from the instance dict so a partially constructed object does not go through __getattr__.
        impl = self.__dict__.get('_file_impl')
        try:
            super().close()
        finally:
            if impl is not None and not impl.closed:
                impl.close()

    @property
    def ok(self):
        """True when the status is 200 or 304 and the wrapped stream is open and readable."""
        if self._status not in (200, 304):
            return False
        impl = self._file_impl
        # readable() raises ValueError on a closed stream, so test `closed` first.
        return (not impl.closed) and impl.readable()

    @property
    def status_code(self):
        """Status code given at construction time."""
        return self._status

    def __del__(self):
        # __init__ may not have run (or may have failed), in which case there is nothing to close.
        impl = self.__dict__.get('_file_impl')
        if impl is not None and not impl.closed:
            self.close()

    def __getattr__(self, item):
        # Only called when normal attribute lookup fails. If '_file_impl' itself is missing, delegating
        # would call __getattr__ again for the same name and recurse without bound.
        if item == '_file_impl':
            raise AttributeError(item)
        return getattr(self._file_impl, item)
def _remote_open(url, timeout: int = http.DEFAULT_TIMEOUT, headers: dict = None, mode='rb'):
    """Fetch ``url`` with ``http.urlopen`` and expose the payload as an in-memory :class:`AnyFile`.

    Parameters
    ----------
    url : str
        Remote location to fetch.
    timeout : int
        Forwarded to ``http.urlopen``.
    headers : dict
        Forwarded to ``http.urlopen``.
    mode : str
        When it contains ``'b'`` the response bytes are wrapped in ``io.BytesIO``; otherwise the response text is
        wrapped in ``io.StringIO``.

    Returns
    -------
    AnyFile
        Wrapper around the in-memory copy of the response body.
    """
    response = http.urlopen(url=url, headers=headers, timeout=timeout)
    wants_bytes = 'b' in mode
    buffer = io.BytesIO(response.bytes) if wants_bytes else io.StringIO(response.text)
    return AnyFile(url, buffer)
def _make_file_from_cache_entry(entry: CacheItem, url: str, mode: str) -> AnyFile:
    """Wrap the data of a cache entry in an in-memory :class:`AnyFile`.

    Parameters
    ----------
    entry : CacheItem
        Cache entry whose ``data`` attribute holds the file content.
    url : str
        URL recorded on the returned file.
    mode : str
        ``io.BytesIO`` is used when it contains ``'b'``, ``io.StringIO`` otherwise.

    Returns
    -------
    AnyFile
        Wrapper around an in-memory stream built from ``entry.data``.
    """
    stream_factory = io.BytesIO if 'b' in mode else io.StringIO
    return AnyFile(url, stream_factory(entry.data))
def _cache_remote_file(url, timeout: int = http.DEFAULT_TIMEOUT, headers: dict = None, mode='rb') -> AnyFile:
    """Download ``url``, store its content in the cache under the key ``url`` and return it as a file.

    Parameters
    ----------
    url : str
        Remote location to fetch; also used as the cache key.
    timeout : int
        Forwarded to ``http.urlopen``.
    headers : dict
        Forwarded to ``http.urlopen``.
    mode : str
        The response bytes are cached when it contains ``'b'``, the response text otherwise.

    Returns
    -------
    AnyFile
        In-memory file built from the freshly cached entry.
    """
    response = http.urlopen(url=url, headers=headers, timeout=timeout)
    payload = response.bytes if 'b' in mode else response.text
    # The entry version is the 'last-modified' header, or the current time when that header is absent.
    version = response.getheader('last-modified', str(datetime.now()))
    cached = CacheItem(data=payload, version=version)
    add_item(key=url, item=cached)
    return _make_file_from_cache_entry(cached, url, mode)
[docs]
def any_loc_open(url, timeout: int = http.DEFAULT_TIMEOUT, headers: Optional[dict] = None, mode='rb',
                 cache_remote_files=False) -> AnyFile:
    """Open a local or remote file and return it as an :class:`AnyFile`.

    Parameters
    ----------
    url : str
        Either a local path (optionally written with a ``file://`` scheme) or a remote URL.
    timeout : int
        Timeout used when fetching remote files (default: ``http.DEFAULT_TIMEOUT``).
    headers : Optional[dict]
        Optional HTTP headers sent when fetching remote files.
    mode : str
        The file open mode. Only 'r' or 'rb' are supported.
    cache_remote_files : bool
        When true, remote files are kept in the Speasy cache and downloaded again only if the 'last-modified'
        value reported by a HEAD request differs from the version stored with the cached entry.

    Returns
    -------
    AnyFile
        The opened file object.
    """
    if is_local_file(url):
        local_path = url.replace('file://', '')
        return AnyFile(url, open(local_path, mode=mode))

    if not cache_remote_files:
        return _remote_open(url, timeout=timeout, headers=headers, mode=mode)

    # Without a 'last-modified' header the current time is used, which never equals a stored version.
    remote_version = http.head(url).getheader('last-modified', str(datetime.now()))
    cached: Optional[CacheItem] = get_item(url)
    if cached is not None and remote_version == cached.version:
        return _make_file_from_cache_entry(cached, url, mode)
    return _cache_remote_file(url, timeout=timeout, headers=headers, mode=mode)
def _list_local_files(path: str) -> List[str]:
    """Return the names of the entries found in the local directory ``path`` (as given by ``os.listdir``)."""
    entries = os.listdir(path)
    return entries
@CacheCall(cache_retention=timedelta(hours=12), is_pure=True)
def _list_remote_files(url: str) -> List[str]:
    """Return the link targets found in the page served at ``url``.

    The page is fetched with ``http.get``; every match of ``_HREF_REGEX`` in its text is returned. An empty list
    is returned when the response is not ``ok``. The call is wrapped by ``CacheCall`` with a 12-hour retention.
    """
    page = http.get(url)
    if not page.ok:
        return []
    return _HREF_REGEX.findall(page.text)
[docs]
def list_files(url: str, file_regex: Union[re.Pattern, str]) -> List[str]:
    """Lists files that match the specified regex pattern either from a web page generated by Apache mod_dir or
    equivalent, or from a local directory.

    Parameters
    ----------
    url : str
        The URL or local path to scan.
    file_regex : re.Pattern or str
        The regular expression pattern used to filter files. A string (or instance of a ``str`` subclass) is
        compiled with ``re.compile``. Names are tested with ``match``, i.e. the pattern must match at the start
        of the name.

    Returns
    -------
    List[str]
        A list of files that match the specified regex pattern, either from a remote source or a local directory.
    """
    # isinstance, unlike an exact type comparison, also accepts str subclasses, which have no ``match`` method
    # and would otherwise fail below.
    if isinstance(file_regex, str):
        file_regex = re.compile(file_regex)
    if is_local_file(url):
        files = _list_local_files(url.replace('file://', ''))
    else:
        files = _list_remote_files(url)
    return [name for name in files if file_regex.match(name)]