book follow-up

Tykayn 2025-08-30 18:14:14 +02:00 committed by tykayn
parent b4b4398bb0
commit 3a7a3849ae
12242 changed files with 2564461 additions and 6914 deletions

@@ -0,0 +1,90 @@
"""
================================
Datasets (:mod:`scipy.datasets`)
================================

.. currentmodule:: scipy.datasets

Dataset Methods
===============

.. autosummary::
   :toctree: generated/

   ascent
   face
   electrocardiogram

Utility Methods
===============

.. autosummary::
   :toctree: generated/

   download_all -- Download all the dataset files to specified path.
   clear_cache -- Clear cached dataset directory.

Usage of Datasets
=================

SciPy dataset methods can simply be called as follows: ``'<dataset-name>()'``.
This downloads the dataset files over the network once, saves them to a local
cache, and then returns a `numpy.ndarray` object representing the dataset.
Note that the return data structure and data type might differ between
dataset methods. For more detailed usage examples, see the documentation of
the particular dataset methods above.
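
For example, fetching the raccoon ``face`` image (a minimal doctest-style
sketch; the first call needs network access and the optional ``pooch``
dependency):

>>> from scipy import datasets
>>> face = datasets.face()  # downloaded once, then served from the cache
>>> face.shape
(768, 1024, 3)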

How dataset retrieval and storage works
=======================================

SciPy dataset files are stored within individual GitHub repositories under the
SciPy GitHub organization, following the naming convention
``'dataset-<name>'``; for example, the `scipy.datasets.face` files live at
https://github.com/scipy/dataset-face. The `scipy.datasets` submodule utilizes
and depends on `Pooch <https://www.fatiando.org/pooch/latest/>`_, a Python
package built to simplify fetching data files. Pooch uses these repos to
retrieve the respective dataset files when calling the dataset function.

A registry of all the datasets, essentially a mapping of filenames to their
SHA256 hashes and repo URLs, is maintained, which Pooch uses to handle and
verify the downloads on function call. After a dataset has been downloaded
once, the files are saved in the system cache directory under ``'scipy-data'``.
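
For example, the registry entry for the ``face`` dataset maps the filename
``face.dat`` to its SHA256 hash and to the URL
https://raw.githubusercontent.com/scipy/dataset-face/main/face.dat (see the
registry file later in this commit).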

Dataset cache locations may vary on different platforms.

For macOS::

    '~/Library/Caches/scipy-data'

For Linux and other Unix-like platforms::

    '~/.cache/scipy-data'  # or the value of the XDG_CACHE_HOME env var, if defined

For Windows::

    'C:\\Users\\<user>\\AppData\\Local\\<AppAuthor>\\scipy-data\\Cache'

In environments with constrained network connectivity (for various security
reasons) or on systems without a continuous internet connection, the cache
can be populated manually by placing the contents of the dataset repos in the
cache directory mentioned above; this avoids download errors when the dataset
functions are called without internet connectivity.
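
The active cache location on the current machine can be queried with Pooch
itself (an illustrative doctest; requires the optional ``pooch`` dependency,
and the exact path depends on the platform, hence the skipped output):

>>> import pooch  # doctest: +SKIP
>>> pooch.os_cache('scipy-data')  # doctest: +SKIP
PosixPath('/home/<user>/.cache/scipy-data')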
"""
from ._fetchers import face, ascent, electrocardiogram
from ._download_all import download_all
from ._utils import clear_cache

__all__ = ['ascent', 'electrocardiogram', 'face',
           'download_all', 'clear_cache']

from scipy._lib._testutils import PytestTester
test = PytestTester(__name__)
del PytestTester

@@ -0,0 +1,71 @@
"""
Platform independent script to download all the
`scipy.datasets` module data files.
This doesn't require a full scipy build.
Run: python _download_all.py <download_dir>
"""
import argparse
try:
import pooch
except ImportError:
pooch = None
if __package__ is None or __package__ == '':
# Running as python script, use absolute import
import _registry # type: ignore
else:
# Running as python module, use relative import
from . import _registry
def download_all(path=None):
"""
Utility method to download all the dataset files
for `scipy.datasets` module.
Parameters
----------
path : str, optional
Directory path to download all the dataset files.
If None, default to the system cache_dir detected by pooch.
Examples
--------
Download the datasets to the default cache location:
>>> from scipy import datasets
>>> datasets.download_all()
Download the datasets to the current directory:
>>> datasets.download_all(".")
"""
if pooch is None:
raise ImportError("Missing optional dependency 'pooch' required "
"for scipy.datasets module. Please use pip or "
"conda to install 'pooch'.")
if path is None:
path = pooch.os_cache('scipy-data')
# https://github.com/scipy/scipy/issues/21879
downloader = pooch.HTTPDownloader(headers={"User-Agent": "SciPy"})
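    # Each file is downloaded only if it is missing from `path` or if its
    # hash does not match the registry entry; otherwise the cached copy is
    # reused.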
for dataset_name, dataset_hash in _registry.registry.items():
pooch.retrieve(url=_registry.registry_urls[dataset_name],
known_hash=dataset_hash,
fname=dataset_name, path=path, downloader=downloader)
def main():
parser = argparse.ArgumentParser(description='Download SciPy data files.')
    # Default to None so that a missing 'pooch' raises the informative
    # ImportError inside download_all() rather than an AttributeError here.
    parser.add_argument("path", nargs='?', type=str,
                        default=None,
                        help="Directory path to download all the data files.")
args = parser.parse_args()
download_all(args.path)
if __name__ == "__main__":
main()

@@ -0,0 +1,225 @@
import sys
from numpy import array, frombuffer, load
from ._registry import registry, registry_urls
try:
import pooch
except ImportError:
pooch = None
data_fetcher = None
else:
data_fetcher = pooch.create(
# Use the default cache folder for the operating system
# Pooch uses appdirs (https://github.com/ActiveState/appdirs) to
# select an appropriate directory for the cache on each platform.
path=pooch.os_cache("scipy-data"),
# The remote data is on Github
# base_url is a required param, even though we override this
# using individual urls in the registry.
base_url="https://github.com/scipy/",
registry=registry,
urls=registry_urls
)
def fetch_data(dataset_name, data_fetcher=data_fetcher):
if data_fetcher is None:
raise ImportError("Missing optional dependency 'pooch' required "
"for scipy.datasets module. Please use pip or "
"conda to install 'pooch'.")
# https://github.com/scipy/scipy/issues/21879
downloader = pooch.HTTPDownloader(
headers={"User-Agent": f"SciPy {sys.modules['scipy'].__version__}"}
)
# The "fetch" method returns the full path to the downloaded data file.
return data_fetcher.fetch(dataset_name, downloader=downloader)
def ascent():
"""
Get an 8-bit grayscale bit-depth, 512 x 512 derived image for easy
use in demos.
The image is derived from
https://pixnio.com/people/accent-to-the-top
Parameters
----------
None
Returns
-------
ascent : ndarray
convenient image to use for testing and demonstration
Examples
--------
>>> import scipy.datasets
>>> ascent = scipy.datasets.ascent()
>>> ascent.shape
(512, 512)
>>> ascent.max()
np.uint8(255)
>>> import matplotlib.pyplot as plt
>>> plt.gray()
>>> plt.imshow(ascent)
>>> plt.show()
"""
import pickle
# The file will be downloaded automatically the first time this is run,
# returning the path to the downloaded file. Afterwards, Pooch finds
# it in the local cache and doesn't repeat the download.
fname = fetch_data("ascent.dat")
# Now we just need to load it with our standard Python tools.
with open(fname, 'rb') as f:
ascent = array(pickle.load(f))
return ascent
def electrocardiogram():
"""
Load an electrocardiogram as an example for a 1-D signal.
The returned signal is a 5 minute long electrocardiogram (ECG), a medical
recording of the heart's electrical activity, sampled at 360 Hz.
Returns
-------
ecg : ndarray
The electrocardiogram in millivolt (mV) sampled at 360 Hz.
Notes
-----
The provided signal is an excerpt (19:35 to 24:35) from the `record 208`_
(lead MLII) provided by the MIT-BIH Arrhythmia Database [1]_ on
PhysioNet [2]_. The excerpt includes noise induced artifacts, typical
heartbeats as well as pathological changes.
.. _record 208: https://physionet.org/physiobank/database/html/mitdbdir/records.htm#208
.. versionadded:: 1.1.0
References
----------
.. [1] Moody GB, Mark RG. The impact of the MIT-BIH Arrhythmia Database.
IEEE Eng in Med and Biol 20(3):45-50 (May-June 2001).
(PMID: 11446209); :doi:`10.13026/C2F305`
.. [2] Goldberger AL, Amaral LAN, Glass L, Hausdorff JM, Ivanov PCh,
Mark RG, Mietus JE, Moody GB, Peng C-K, Stanley HE. PhysioBank,
PhysioToolkit, and PhysioNet: Components of a New Research Resource
for Complex Physiologic Signals. Circulation 101(23):e215-e220;
:doi:`10.1161/01.CIR.101.23.e215`
Examples
--------
>>> from scipy.datasets import electrocardiogram
>>> ecg = electrocardiogram()
>>> ecg
array([-0.245, -0.215, -0.185, ..., -0.405, -0.395, -0.385], shape=(108000,))
>>> ecg.shape, ecg.mean(), ecg.std()
((108000,), -0.16510875, 0.5992473991177294)
As stated the signal features several areas with a different morphology.
E.g., the first few seconds show the electrical activity of a heart in
normal sinus rhythm as seen below.
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> fs = 360
>>> time = np.arange(ecg.size) / fs
>>> plt.plot(time, ecg)
>>> plt.xlabel("time in s")
>>> plt.ylabel("ECG in mV")
>>> plt.xlim(9, 10.2)
>>> plt.ylim(-1, 1.5)
>>> plt.show()
After second 16, however, the first premature ventricular contractions,
also called extrasystoles, appear. These have a different morphology
compared to typical heartbeats. The difference can easily be observed
in the following plot.
>>> plt.plot(time, ecg)
>>> plt.xlabel("time in s")
>>> plt.ylabel("ECG in mV")
>>> plt.xlim(46.5, 50)
>>> plt.ylim(-2, 1.5)
>>> plt.show()
At several points large artifacts disturb the recording, e.g.:
>>> plt.plot(time, ecg)
>>> plt.xlabel("time in s")
>>> plt.ylabel("ECG in mV")
>>> plt.xlim(207, 215)
>>> plt.ylim(-2, 3.5)
>>> plt.show()
Finally, examining the power spectrum reveals that most of the biosignal is
made up of lower frequencies. At 60 Hz the noise induced by the mains
electricity can be clearly observed.
>>> from scipy.signal import welch
>>> f, Pxx = welch(ecg, fs=fs, nperseg=2048, scaling="spectrum")
>>> plt.semilogy(f, Pxx)
>>> plt.xlabel("Frequency in Hz")
>>> plt.ylabel("Power spectrum of the ECG in mV**2")
>>> plt.xlim(f[[0, -1]])
>>> plt.show()
"""
fname = fetch_data("ecg.dat")
with load(fname) as file:
ecg = file["ecg"].astype(int) # np.uint16 -> int
# Convert raw output of ADC to mV: (ecg - adc_zero) / adc_gain
ecg = (ecg - 1024) / 200.0
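    # e.g. a raw ADC sample of 1224 maps to (1224 - 1024) / 200.0 = 1.0 mV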
return ecg
def face(gray=False):
"""
Get a 1024 x 768, color image of a raccoon face.
The image is derived from
https://pixnio.com/fauna-animals/raccoons/raccoon-procyon-lotor
Parameters
----------
gray : bool, optional
If True return 8-bit grey-scale image, otherwise return a color image
Returns
-------
face : ndarray
image of a raccoon face
Examples
--------
>>> import scipy.datasets
>>> face = scipy.datasets.face()
>>> face.shape
(768, 1024, 3)
>>> face.max()
np.uint8(255)
>>> import matplotlib.pyplot as plt
>>> plt.gray()
>>> plt.imshow(face)
>>> plt.show()
"""
import bz2
fname = fetch_data("face.dat")
with open(fname, 'rb') as f:
rawdata = f.read()
face_data = bz2.decompress(rawdata)
face = frombuffer(face_data, dtype='uint8')
face.shape = (768, 1024, 3)
if gray is True:
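        # Approximate Rec. 709 luma weights for the RGB -> grayscale conversion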
face = (0.21 * face[:, :, 0] + 0.71 * face[:, :, 1] +
0.07 * face[:, :, 2]).astype('uint8')
return face

@@ -0,0 +1,26 @@
##########################################################################
# This file serves as the dataset registry for SciPy Datasets SubModule.
##########################################################################
# To generate the SHA256 hash, use the command
# openssl sha256 <filename>
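#
# Equivalently, in pure Python (a small sketch using only the standard
# library; `sha256sum` is an illustrative helper, not part of this module):
#
#     import hashlib
#
#     def sha256sum(filename):
#         with open(filename, "rb") as f:
#             return hashlib.sha256(f.read()).hexdigest()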
registry = {
"ascent.dat": "03ce124c1afc880f87b55f6b061110e2e1e939679184f5614e38dacc6c1957e2",
"ecg.dat": "f20ad3365fb9b7f845d0e5c48b6fe67081377ee466c3a220b7f69f35c8958baf",
"face.dat": "9d8b0b4d081313e2b485748c770472e5a95ed1738146883d84c7030493e82886"
}
registry_urls = {
"ascent.dat": "https://raw.githubusercontent.com/scipy/dataset-ascent/main/ascent.dat",
"ecg.dat": "https://raw.githubusercontent.com/scipy/dataset-ecg/main/ecg.dat",
"face.dat": "https://raw.githubusercontent.com/scipy/dataset-face/main/face.dat"
}
# dataset method mapping with their associated filenames
# <method_name> : ["filename1", "filename2", ...]
method_files_map = {
"ascent": ["ascent.dat"],
"electrocardiogram": ["ecg.dat"],
"face": ["face.dat"]
}

@@ -0,0 +1,81 @@
import os
import shutil
from ._registry import method_files_map
try:
import platformdirs
except ImportError:
platformdirs = None # type: ignore[assignment]
def _clear_cache(datasets, cache_dir=None, method_map=None):
if method_map is None:
# Use SciPy Datasets method map
method_map = method_files_map
if cache_dir is None:
# Use default cache_dir path
if platformdirs is None:
# platformdirs is pooch dependency
raise ImportError("Missing optional dependency 'pooch' required "
"for scipy.datasets module. Please use pip or "
"conda to install 'pooch'.")
cache_dir = platformdirs.user_cache_dir("scipy-data")
if not os.path.exists(cache_dir):
print(f"Cache Directory {cache_dir} doesn't exist. Nothing to clear.")
return
if datasets is None:
print(f"Cleaning the cache directory {cache_dir}!")
shutil.rmtree(cache_dir)
else:
if not isinstance(datasets, list | tuple):
# single dataset method passed should be converted to list
datasets = [datasets, ]
for dataset in datasets:
assert callable(dataset)
dataset_name = dataset.__name__ # Name of the dataset method
if dataset_name not in method_map:
raise ValueError(f"Dataset method {dataset_name} doesn't "
"exist. Please check if the passed dataset "
"is a subset of the following dataset "
f"methods: {list(method_map.keys())}")
data_files = method_map[dataset_name]
data_filepaths = [os.path.join(cache_dir, file)
for file in data_files]
for data_filepath in data_filepaths:
if os.path.exists(data_filepath):
print("Cleaning the file "
f"{os.path.split(data_filepath)[1]} "
f"for dataset {dataset_name}")
os.remove(data_filepath)
else:
print(f"Path {data_filepath} doesn't exist. "
"Nothing to clear.")
def clear_cache(datasets=None):
"""
Cleans the scipy datasets cache directory.
If a scipy.datasets method or a list/tuple of the same is
provided, then clear_cache removes all the data files
associated to the passed dataset method callable(s).
By default, it removes all the cached data files.
Parameters
----------
datasets : callable or list/tuple of callable or None
Examples
--------
>>> from scipy import datasets
>>> ascent_array = datasets.ascent()
>>> ascent_array.shape
(512, 512)
>>> datasets.clear_cache([datasets.ascent])
Cleaning the file ascent.dat for dataset ascent
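
    Calling ``clear_cache`` with no argument removes the entire cache
    directory (an illustrative doctest; the printed path depends on the
    platform, hence the skipped output):

    >>> datasets.clear_cache()  # doctest: +SKIP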
"""
_clear_cache(datasets)

@@ -0,0 +1,128 @@
from scipy.datasets._registry import registry
from scipy.datasets._fetchers import data_fetcher
from scipy.datasets._utils import _clear_cache
from scipy.datasets import ascent, face, electrocardiogram, download_all
from numpy.testing import assert_equal, assert_almost_equal
import os
from threading import get_ident
import pytest
try:
import pooch
except ImportError:
raise ImportError("Missing optional dependency 'pooch' required "
"for scipy.datasets module. Please use pip or "
"conda to install 'pooch'.")
data_dir = data_fetcher.path # type: ignore
def _has_hash(path, expected_hash):
"""Check if the provided path has the expected hash."""
if not os.path.exists(path):
return False
return pooch.file_hash(path) == expected_hash
class TestDatasets:
@pytest.fixture(scope='module', autouse=True)
def test_download_all(self):
# This fixture requires INTERNET CONNECTION
# test_setup phase
download_all()
yield
@pytest.mark.fail_slow(10)
def test_existence_all(self):
assert len(os.listdir(data_dir)) >= len(registry)
def test_ascent(self):
assert_equal(ascent().shape, (512, 512))
# hash check
assert _has_hash(os.path.join(data_dir, "ascent.dat"),
registry["ascent.dat"])
def test_face(self):
assert_equal(face().shape, (768, 1024, 3))
# hash check
assert _has_hash(os.path.join(data_dir, "face.dat"),
registry["face.dat"])
def test_electrocardiogram(self):
# Test shape, dtype and stats of signal
ecg = electrocardiogram()
assert_equal(ecg.dtype, float)
assert_equal(ecg.shape, (108000,))
assert_almost_equal(ecg.mean(), -0.16510875)
assert_almost_equal(ecg.std(), 0.5992473991177294)
# hash check
assert _has_hash(os.path.join(data_dir, "ecg.dat"),
registry["ecg.dat"])
def test_clear_cache(tmp_path):
# Note: `tmp_path` is a pytest fixture, it handles cleanup
thread_basepath = tmp_path / str(get_ident())
thread_basepath.mkdir()
dummy_basepath = thread_basepath / "dummy_cache_dir"
dummy_basepath.mkdir()
# Create three dummy dataset files for dummy dataset methods
dummy_method_map = {}
for i in range(4):
dummy_method_map[f"data{i}"] = [f"data{i}.dat"]
data_filepath = dummy_basepath / f"data{i}.dat"
data_filepath.write_text("")
    # clear files associated with the single dataset method data0;
    # also test a callable argument instead of a list of callables
def data0():
pass
_clear_cache(datasets=data0, cache_dir=dummy_basepath,
method_map=dummy_method_map)
assert not os.path.exists(dummy_basepath/"data0.dat")
    # clear files associated with multiple dataset methods "data1" and "data2"
def data1():
pass
def data2():
pass
_clear_cache(datasets=[data1, data2], cache_dir=dummy_basepath,
method_map=dummy_method_map)
assert not os.path.exists(dummy_basepath/"data1.dat")
assert not os.path.exists(dummy_basepath/"data2.dat")
    # clear multiple dataset files "data4_0.dat" and "data4_1.dat"
    # associated with dataset method "data4"
def data4():
pass
# create files
(dummy_basepath / "data4_0.dat").write_text("")
(dummy_basepath / "data4_1.dat").write_text("")
dummy_method_map["data4"] = ["data4_0.dat", "data4_1.dat"]
_clear_cache(datasets=[data4], cache_dir=dummy_basepath,
method_map=dummy_method_map)
assert not os.path.exists(dummy_basepath/"data4_0.dat")
assert not os.path.exists(dummy_basepath/"data4_1.dat")
# wrong dataset method should raise ValueError since it
# doesn't exist in the dummy_method_map
def data5():
pass
with pytest.raises(ValueError):
_clear_cache(datasets=[data5], cache_dir=dummy_basepath,
method_map=dummy_method_map)
# remove all dataset cache
_clear_cache(datasets=None, cache_dir=dummy_basepath)
assert not os.path.exists(dummy_basepath)