up follow livre
This commit is contained in:
parent
70a5c3465c
commit
cffb31c1ef
12198 changed files with 2562132 additions and 35 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,748 @@
|
|||
from collections.abc import Generator
|
||||
from contextlib import contextmanager
|
||||
import re
|
||||
import struct
|
||||
import tracemalloc
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import hashtable as ht
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.core.algorithms import isin
|
||||
|
||||
|
||||
@contextmanager
def activated_tracemalloc() -> Generator[None, None, None]:
    """Run the ``with`` body while ``tracemalloc`` tracing is switched on.

    Tracing is started on entry and stopped in a ``finally`` clause, so it
    is switched off again even when the body raises.
    """
    tracemalloc.start()
    try:
        yield
    finally:
        tracemalloc.stop()
|
||||
|
||||
|
||||
def get_allocated_khash_memory():
    """Return the summed size of all traces in the hashtable trace domain.

    A ``tracemalloc`` snapshot is taken and filtered (inclusively) to the
    domain reported by ``ht.get_hashtable_trace_domain()``; the sizes of the
    remaining traces are added up.  ``tracemalloc`` must already be tracing
    when this is called, since ``take_snapshot`` needs an active trace.
    """
    snapshot = tracemalloc.take_snapshot()
    snapshot = snapshot.filter_traces(
        (tracemalloc.DomainFilter(True, ht.get_hashtable_trace_domain()),)
    )
    return sum(x.size for x in snapshot.traces)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "table_type, dtype",
    [
        (ht.PyObjectHashTable, np.object_),
        (ht.Complex128HashTable, np.complex128),
        (ht.Int64HashTable, np.int64),
        (ht.UInt64HashTable, np.uint64),
        (ht.Float64HashTable, np.float64),
        (ht.Complex64HashTable, np.complex64),
        (ht.Int32HashTable, np.int32),
        (ht.UInt32HashTable, np.uint32),
        (ht.Float32HashTable, np.float32),
        (ht.Int16HashTable, np.int16),
        (ht.UInt16HashTable, np.uint16),
        (ht.Int8HashTable, np.int8),
        (ht.UInt8HashTable, np.uint8),
        (ht.IntpHashTable, np.intp),
    ],
)
class TestHashTable:
    """Shared checks run against each (hash table class, dtype) pair above.

    ``writable`` is a fixture that is not defined in this file; its value is
    assigned to ``ndarray.flags.writeable`` on the input arrays.
    """

    def test_get_set_contains_len(self, table_type, dtype):
        """Exercise ``set_item``/``get_item``/``in``/``len`` on two keys."""
        index = 5
        table = table_type(55)
        assert len(table) == 0
        assert index not in table

        table.set_item(index, 42)
        assert len(table) == 1
        assert index in table
        assert table.get_item(index) == 42

        table.set_item(index + 1, 41)
        assert index in table
        assert index + 1 in table
        assert len(table) == 2
        assert table.get_item(index) == 42
        assert table.get_item(index + 1) == 41

        # overwriting an existing key must not grow the table
        table.set_item(index, 21)
        assert index in table
        assert index + 1 in table
        assert len(table) == 2
        assert table.get_item(index) == 21
        assert table.get_item(index + 1) == 41
        assert index + 2 not in table

        table.set_item(index + 1, 21)
        assert index in table
        assert index + 1 in table
        assert len(table) == 2
        assert table.get_item(index) == 21
        assert table.get_item(index + 1) == 21

        with pytest.raises(KeyError, match=str(index + 2)):
            table.get_item(index + 2)

    def test_get_set_contains_len_mask(self, table_type, dtype):
        """Same as above for ``uses_mask=True`` tables, including the NA slot."""
        if table_type == ht.PyObjectHashTable:
            pytest.skip("Mask not supported for object")
        index = 5
        table = table_type(55, uses_mask=True)
        assert len(table) == 0
        assert index not in table

        table.set_item(index, 42)
        assert len(table) == 1
        assert index in table
        assert table.get_item(index) == 42
        with pytest.raises(KeyError, match="NA"):
            table.get_na()

        table.set_item(index + 1, 41)
        table.set_na(41)
        assert pd.NA in table
        assert index in table
        assert index + 1 in table
        assert len(table) == 3
        assert table.get_item(index) == 42
        assert table.get_item(index + 1) == 41
        assert table.get_na() == 41

        # resetting NA replaces its value without changing the length
        table.set_na(21)
        assert index in table
        assert index + 1 in table
        assert len(table) == 3
        assert table.get_item(index + 1) == 41
        assert table.get_na() == 21
        assert index + 2 not in table

        with pytest.raises(KeyError, match=str(index + 2)):
            table.get_item(index + 2)

    def test_map_keys_to_values(self, table_type, dtype, writable):
        """``map_keys_to_values`` stores ``vals[i]`` under ``keys[i]``."""
        # only Int64HashTable has this method
        if table_type == ht.Int64HashTable:
            N = 77
            table = table_type()
            keys = np.arange(N).astype(dtype)
            vals = np.arange(N).astype(np.int64) + N
            keys.flags.writeable = writable
            vals.flags.writeable = writable
            table.map_keys_to_values(keys, vals)
            for i in range(N):
                assert table.get_item(keys[i]) == i + N

    def test_map_locations(self, table_type, dtype, writable):
        """``map_locations`` maps each key to its position in the input."""
        N = 8
        table = table_type()
        keys = (np.arange(N) + N).astype(dtype)
        keys.flags.writeable = writable
        table.map_locations(keys)
        for i in range(N):
            assert table.get_item(keys[i]) == i

    def test_map_locations_mask(self, table_type, dtype, writable):
        """A masked entry is reachable via ``get_na`` and not via its key."""
        if table_type == ht.PyObjectHashTable:
            pytest.skip("Mask not supported for object")
        N = 3
        table = table_type(uses_mask=True)
        keys = (np.arange(N) + N).astype(dtype)
        keys.flags.writeable = writable
        table.map_locations(keys, np.array([False, False, True]))
        for i in range(N - 1):
            assert table.get_item(keys[i]) == i

        with pytest.raises(KeyError, match=re.escape(str(keys[N - 1]))):
            table.get_item(keys[N - 1])

        assert table.get_na() == 2

    def test_lookup(self, table_type, dtype, writable):
        """``lookup`` of the mapped keys returns their positions."""
        N = 3
        table = table_type()
        keys = (np.arange(N) + N).astype(dtype)
        keys.flags.writeable = writable
        table.map_locations(keys)
        result = table.lookup(keys)
        expected = np.arange(N)
        tm.assert_numpy_array_equal(result.astype(np.int64), expected.astype(np.int64))

    def test_lookup_wrong(self, table_type, dtype):
        """``lookup`` of keys that were never mapped returns -1 everywhere."""
        # smaller N for 8-bit dtypes so that keys in [N, 2N) stay in range
        if dtype in (np.int8, np.uint8):
            N = 100
        else:
            N = 512
        table = table_type()
        keys = (np.arange(N) + N).astype(dtype)
        table.map_locations(keys)
        wrong_keys = np.arange(N).astype(dtype)
        result = table.lookup(wrong_keys)
        assert np.all(result == -1)

    def test_lookup_mask(self, table_type, dtype, writable):
        """``lookup`` with a mask finds masked positions; masked key alone is -1."""
        if table_type == ht.PyObjectHashTable:
            pytest.skip("Mask not supported for object")
        N = 3
        table = table_type(uses_mask=True)
        keys = (np.arange(N) + N).astype(dtype)
        mask = np.array([False, True, False])
        keys.flags.writeable = writable
        table.map_locations(keys, mask)
        result = table.lookup(keys, mask)
        expected = np.arange(N)
        tm.assert_numpy_array_equal(result.astype(np.int64), expected.astype(np.int64))

        # the key at the masked position was stored as NA, not under its value
        result = table.lookup(np.array([1 + N]).astype(dtype), np.array([False]))
        tm.assert_numpy_array_equal(
            result.astype(np.int64), np.array([-1], dtype=np.int64)
        )

    def test_unique(self, table_type, dtype, writable):
        """``unique`` on 5-fold repeated keys returns the keys in order."""
        if dtype in (np.int8, np.uint8):
            N = 88
        else:
            N = 1000
        table = table_type()
        expected = (np.arange(N) + N).astype(dtype)
        keys = np.repeat(expected, 5)
        keys.flags.writeable = writable
        unique = table.unique(keys)
        tm.assert_numpy_array_equal(unique, expected)

    def test_tracemalloc_works(self, table_type, dtype):
        """Traced hashtable memory equals ``sizeof()`` and drops to 0 on delete."""
        if dtype in (np.int8, np.uint8):
            N = 256
        else:
            N = 30000
        keys = np.arange(N).astype(dtype)
        with activated_tracemalloc():
            table = table_type()
            table.map_locations(keys)
            used = get_allocated_khash_memory()
            my_size = table.sizeof()
            assert used == my_size
            del table
            assert get_allocated_khash_memory() == 0

    def test_tracemalloc_for_empty(self, table_type, dtype):
        """Same memory accounting check for a table with no entries."""
        with activated_tracemalloc():
            table = table_type()
            used = get_allocated_khash_memory()
            my_size = table.sizeof()
            assert used == my_size
            del table
            assert get_allocated_khash_memory() == 0

    def test_get_state(self, table_type, dtype):
        """``get_state`` of a fresh table reports zero size and the bucket keys."""
        table = table_type(1000)
        state = table.get_state()
        assert state["size"] == 0
        assert state["n_occupied"] == 0
        assert "n_buckets" in state
        assert "upper_bound" in state

    @pytest.mark.parametrize("N", range(1, 110))
    def test_no_reallocation(self, table_type, dtype, N):
        """A table preallocated for N keys keeps its bucket count after N inserts."""
        keys = np.arange(N).astype(dtype)
        preallocated_table = table_type(N)
        n_buckets_start = preallocated_table.get_state()["n_buckets"]
        preallocated_table.map_locations(keys)
        n_buckets_end = preallocated_table.get_state()["n_buckets"]
        # original number of buckets was enough:
        assert n_buckets_start == n_buckets_end
        # check with clean table (not too much preallocated)
        clean_table = table_type()
        clean_table.map_locations(keys)
        assert n_buckets_start == clean_table.get_state()["n_buckets"]
|
||||
|
||||
|
||||
class TestHashTableUnsorted:
    """Assorted hash table regression tests (see the GH references below)."""

    # TODO: moved from test_algos; may be redundancies with other tests
    def test_string_hashtable_set_item_signature(self):
        """Non-string keys to ``StringHashTable`` raise ``TypeError``."""
        # GH#30419 fix typing in StringHashTable.set_item to prevent segfault
        tbl = ht.StringHashTable()

        tbl.set_item("key", 1)
        assert tbl.get_item("key") == 1

        with pytest.raises(TypeError, match="'key' has incorrect type"):
            # key arg typed as string, not object
            tbl.set_item(4, 6)
        with pytest.raises(TypeError, match="'val' has incorrect type"):
            tbl.get_item(4)

    def test_lookup_nan(self, writable):
        """``Float64HashTable`` round-trips an array containing NaN."""
        # GH#21688 ensure we can deal with readonly memory views
        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
        xs.setflags(write=writable)
        m = ht.Float64HashTable()
        m.map_locations(xs)
        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))

    def test_add_signed_zeros(self):
        """``0.0`` and ``-0.0`` occupy a single slot."""
        # GH#21866 inconsistent hash-function for float64
        # default hash-function would lead to different hash-buckets
        # for 0.0 and -0.0 if there are more than 2^30 hash-buckets
        # but this would mean 16GB
        N = 4  # 12 * 10**8 would trigger the error, if you have enough memory
        m = ht.Float64HashTable(N)
        m.set_item(0.0, 0)
        m.set_item(-0.0, 0)
        assert len(m) == 1  # 0.0 and -0.0 are equivalent

    def test_add_different_nans(self):
        """NaNs with different bit patterns occupy a single slot."""
        # GH#21866 inconsistent hash-function for float64
        # create different nans from bit-patterns:
        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
        assert NAN1 != NAN1
        assert NAN2 != NAN2
        # default hash function would lead to different hash-buckets
        # for NAN1 and NAN2 even if there are only 4 buckets:
        m = ht.Float64HashTable()
        m.set_item(NAN1, 0)
        m.set_item(NAN2, 0)
        assert len(m) == 1  # NAN1 and NAN2 are equivalent

    def test_lookup_overflow(self, writable):
        """``UInt64HashTable`` handles a key of ``2**63``."""
        xs = np.array([1, 2, 2**63], dtype=np.uint64)
        # GH 21688 ensure we can deal with readonly memory views
        xs.setflags(write=writable)
        m = ht.UInt64HashTable()
        m.map_locations(xs)
        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))

    @pytest.mark.parametrize("nvals", [0, 10])  # resizing to 0 is special case
    @pytest.mark.parametrize(
        "htable, uniques, dtype, safely_resizes",
        [
            (ht.PyObjectHashTable, ht.ObjectVector, "object", False),
            (ht.StringHashTable, ht.ObjectVector, "object", True),
            (ht.Float64HashTable, ht.Float64Vector, "float64", False),
            (ht.Int64HashTable, ht.Int64Vector, "int64", False),
            (ht.Int32HashTable, ht.Int32Vector, "int32", False),
            (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False),
        ],
    )
    def test_vector_resize(
        self, writable, htable, uniques, dtype, safely_resizes, nvals
    ):
        """Appending to a vector after ``to_array()`` raises unless it is safe."""
        # Test for memory errors after internal vector
        # reallocations (GH 7157)
        # Changed from using np.random.default_rng(2).rand to range
        # which could cause flaky CI failures when safely_resizes=False
        vals = np.array(range(1000), dtype=dtype)

        # GH 21688 ensures we can deal with read-only memory views
        vals.setflags(write=writable)

        # initialise instances; cannot initialise in parametrization,
        # as otherwise external views would be held on the array (which is
        # one of the things this test is checking)
        htable = htable()
        uniques = uniques()

        # get_labels may append to uniques
        htable.get_labels(vals[:nvals], uniques, 0, -1)
        # to_array() sets an external_view_exists flag on uniques.
        tmp = uniques.to_array()
        oldshape = tmp.shape

        # subsequent get_labels() calls can no longer append to it
        # (except for StringHashTables + ObjectVector)
        if safely_resizes:
            htable.get_labels(vals, uniques, 0, -1)
        else:
            with pytest.raises(ValueError, match="external reference.*"):
                htable.get_labels(vals, uniques, 0, -1)

        uniques.to_array()  # should not raise here
        assert tmp.shape == oldshape

    @pytest.mark.parametrize(
        "hashtable",
        [
            ht.PyObjectHashTable,
            ht.StringHashTable,
            ht.Float64HashTable,
            ht.Int64HashTable,
            ht.Int32HashTable,
            ht.UInt64HashTable,
        ],
    )
    def test_hashtable_large_sizehint(self, hashtable):
        """Constructing with a ``size_hint`` above the uint32 range must not raise."""
        # GH#22729 smoketest for not raising when passing a large size_hint
        size_hint = np.iinfo(np.uint32).max + 1
        hashtable(size_hint=size_hint)
|
||||
|
||||
|
||||
class TestPyObjectHashTableWithNans:
    """``PyObjectHashTable`` lookups where keys contain distinct NaN objects."""

    def test_nan_float(self):
        """Two separate float NaN objects resolve to the same entry."""
        nan1 = float("nan")
        nan2 = float("nan")
        assert nan1 is not nan2
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42

    def test_nan_complex_both(self):
        """Complex keys with NaN in both parts resolve to the same entry."""
        nan1 = complex(float("nan"), float("nan"))
        nan2 = complex(float("nan"), float("nan"))
        assert nan1 is not nan2
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42

    def test_nan_complex_real(self):
        """NaN real part: equal imaginary parts match, different ones do not."""
        nan1 = complex(float("nan"), 1)
        nan2 = complex(float("nan"), 1)
        other = complex(float("nan"), 2)
        assert nan1 is not nan2
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42
        with pytest.raises(KeyError, match=None) as error:
            table.get_item(other)
        assert str(error.value) == str(other)

    def test_nan_complex_imag(self):
        """NaN imaginary part: equal real parts match, different ones do not."""
        nan1 = complex(1, float("nan"))
        nan2 = complex(1, float("nan"))
        other = complex(2, float("nan"))
        assert nan1 is not nan2
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42
        with pytest.raises(KeyError, match=None) as error:
            table.get_item(other)
        assert str(error.value) == str(other)

    def test_nan_in_tuple(self):
        """One-element tuples holding distinct NaN objects resolve to one entry."""
        nan1 = (float("nan"),)
        nan2 = (float("nan"),)
        assert nan1[0] is not nan2[0]
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42

    def test_nan_in_nested_tuple(self):
        """Nested tuples with NaN match each other; a different tuple is missing."""
        nan1 = (1, (2, (float("nan"),)))
        nan2 = (1, (2, (float("nan"),)))
        other = (1, 2)
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42
        with pytest.raises(KeyError, match=None) as error:
            table.get_item(other)
        assert str(error.value) == str(other)
|
||||
|
||||
|
||||
def test_hash_equal_tuple_with_nans():
    """Nested tuples of NaNs hash equally and compare equal as objects."""
    a = (float("nan"), (float("nan"), float("nan")))
    b = (float("nan"), (float("nan"), float("nan")))
    assert ht.object_hash(a) == ht.object_hash(b)
    assert ht.objects_are_equal(a, b)
|
||||
|
||||
|
||||
def test_get_labels_groupby_for_Int64(writable):
    """``get_labels_groupby`` labels -1 inputs as -1 and omits them from uniques.

    ``writable`` comes from a fixture not defined in this file; it is
    assigned to the input array's ``flags.writeable``.
    """
    table = ht.Int64HashTable()
    vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64)
    vals.flags.writeable = writable
    arr, unique = table.get_labels_groupby(vals)
    expected_arr = np.array([0, 1, -1, 1, 0, -1], dtype=np.intp)
    expected_unique = np.array([1, 2], dtype=np.int64)
    tm.assert_numpy_array_equal(arr, expected_arr)
    tm.assert_numpy_array_equal(unique, expected_unique)
|
||||
|
||||
|
||||
def test_tracemalloc_works_for_StringHashTable():
    """Traced memory of a filled ``StringHashTable`` equals ``sizeof()``.

    After the table is deleted the traced hashtable memory must be 0.
    """
    N = 1000
    keys = np.arange(N).astype(np.str_).astype(np.object_)
    with activated_tracemalloc():
        table = ht.StringHashTable()
        table.map_locations(keys)
        used = get_allocated_khash_memory()
        my_size = table.sizeof()
        assert used == my_size
        del table
        assert get_allocated_khash_memory() == 0
|
||||
|
||||
|
||||
def test_tracemalloc_for_empty_StringHashTable():
    """Traced memory of an empty ``StringHashTable`` equals ``sizeof()``.

    After the table is deleted the traced hashtable memory must be 0.
    """
    with activated_tracemalloc():
        table = ht.StringHashTable()
        used = get_allocated_khash_memory()
        my_size = table.sizeof()
        assert used == my_size
        del table
        assert get_allocated_khash_memory() == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("N", range(1, 110))
def test_no_reallocation_StringHashTable(N):
    """A ``StringHashTable`` preallocated for N keys keeps its bucket count."""
    keys = np.arange(N).astype(np.str_).astype(np.object_)
    preallocated_table = ht.StringHashTable(N)
    n_buckets_start = preallocated_table.get_state()["n_buckets"]
    preallocated_table.map_locations(keys)
    n_buckets_end = preallocated_table.get_state()["n_buckets"]
    # original number of buckets was enough:
    assert n_buckets_start == n_buckets_end
    # check with clean table (not too much preallocated)
    clean_table = ht.StringHashTable()
    clean_table.map_locations(keys)
    assert n_buckets_start == clean_table.get_state()["n_buckets"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "table_type, dtype",
    [
        (ht.Float64HashTable, np.float64),
        (ht.Float32HashTable, np.float32),
        (ht.Complex128HashTable, np.complex128),
        (ht.Complex64HashTable, np.complex64),
    ],
)
class TestHashTableWithNans:
    """NaN-key behaviour of the float and complex hash tables listed above."""

    def test_get_set_contains_len(self, table_type, dtype):
        """A NaN key can be set, found, and overwritten in place."""
        index = float("nan")
        table = table_type()
        assert index not in table

        table.set_item(index, 42)
        assert len(table) == 1
        assert index in table
        assert table.get_item(index) == 42

        table.set_item(index, 41)
        assert len(table) == 1
        assert index in table
        assert table.get_item(index) == 41

    def test_map_locations(self, table_type, dtype):
        """Mapping N NaNs yields one entry holding the last position."""
        N = 10
        table = table_type()
        keys = np.full(N, np.nan, dtype=dtype)
        table.map_locations(keys)
        assert len(table) == 1
        assert table.get_item(np.nan) == N - 1

    def test_unique(self, table_type, dtype):
        """``unique`` of an all-NaN array is a single NaN."""
        N = 1020
        table = table_type()
        keys = np.full(N, np.nan, dtype=dtype)
        unique = table.unique(keys)
        assert np.all(np.isnan(unique)) and len(unique) == 1
|
||||
|
||||
|
||||
def test_unique_for_nan_objects_floats():
    """50 distinct float NaN objects collapse to one unique value."""
    table = ht.PyObjectHashTable()
    keys = np.array([float("nan") for i in range(50)], dtype=np.object_)
    unique = table.unique(keys)
    assert len(unique) == 1
|
||||
|
||||
|
||||
def test_unique_for_nan_objects_complex():
    """50 distinct ``complex(nan, 1.0)`` objects collapse to one unique value."""
    table = ht.PyObjectHashTable()
    keys = np.array([complex(float("nan"), 1.0) for i in range(50)], dtype=np.object_)
    unique = table.unique(keys)
    assert len(unique) == 1
|
||||
|
||||
|
||||
def test_unique_for_nan_objects_tuple():
    """Equal nested tuples containing NaN collapse to one unique value.

    The leading ``1`` is a second, different key, so two uniques are expected.
    """
    table = ht.PyObjectHashTable()
    keys = np.array(
        [1] + [(1.0, (float("nan"), 1.0)) for i in range(50)], dtype=np.object_
    )
    unique = table.unique(keys)
    assert len(unique) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype",
    [
        np.object_,
        np.complex128,
        np.int64,
        np.uint64,
        np.float64,
        np.complex64,
        np.int32,
        np.uint32,
        np.float32,
        np.int16,
        np.uint16,
        np.int8,
        np.uint8,
        np.intp,
    ],
)
class TestHelpFunctions:
    """Checks of the module-level ``ht`` helpers for each dtype listed above.

    ``writable`` is a fixture that is not defined in this file; its value is
    assigned to ``ndarray.flags.writeable`` on the input arrays.
    """

    def test_value_count(self, dtype, writable):
        """Each of N values repeated 5 times is counted 5 times."""
        N = 43
        expected = (np.arange(N) + N).astype(dtype)
        values = np.repeat(expected, 5)
        values.flags.writeable = writable
        keys, counts, _ = ht.value_count(values, False)
        tm.assert_numpy_array_equal(np.sort(keys), expected)
        assert np.all(counts == 5)

    def test_value_count_mask(self, dtype):
        """Masked entries are reported through the NA counter."""
        if dtype == np.object_:
            pytest.skip("mask not implemented for object dtype")
        values = np.array([1] * 5, dtype=dtype)
        mask = np.zeros((5,), dtype=np.bool_)
        mask[1] = True
        mask[4] = True
        keys, counts, na_counter = ht.value_count(values, False, mask=mask)
        assert len(keys) == 2
        assert na_counter == 2

    def test_value_count_stable(self, dtype, writable):
        """Keys come back in order of first appearance."""
        # GH12679
        values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
        values.flags.writeable = writable
        keys, counts, _ = ht.value_count(values, False)
        tm.assert_numpy_array_equal(keys, values)
        assert np.all(counts == 1)

    def test_duplicated_first(self, dtype, writable):
        """Only the first of every 5 repeats is marked as not duplicated."""
        N = 100
        values = np.repeat(np.arange(N).astype(dtype), 5)
        values.flags.writeable = writable
        result = ht.duplicated(values)
        expected = np.ones_like(values, dtype=np.bool_)
        expected[::5] = False
        tm.assert_numpy_array_equal(result, expected)

    def test_ismember_yes(self, dtype, writable):
        """Every element of ``arr`` is a member of an identical ``values``."""
        N = 127
        arr = np.arange(N).astype(dtype)
        values = np.arange(N).astype(dtype)
        arr.flags.writeable = writable
        values.flags.writeable = writable
        result = ht.ismember(arr, values)
        expected = np.ones_like(values, dtype=np.bool_)
        tm.assert_numpy_array_equal(result, expected)

    def test_ismember_no(self, dtype):
        """No element of ``arr`` is a member of a disjoint ``values``."""
        N = 17
        arr = np.arange(N).astype(dtype)
        values = (np.arange(N) + N).astype(dtype)
        result = ht.ismember(arr, values)
        expected = np.zeros_like(values, dtype=np.bool_)
        tm.assert_numpy_array_equal(result, expected)

    def test_mode(self, dtype, writable):
        """Overwriting one element with 42 makes 42 the single mode."""
        if dtype in (np.int8, np.uint8):
            N = 53
        else:
            N = 11111
        values = np.repeat(np.arange(N).astype(dtype), 5)
        values[0] = 42
        values.flags.writeable = writable
        result = ht.mode(values, False)[0]
        assert result == 42

    def test_mode_stable(self, dtype, writable):
        """With all counts tied, modes come back in input order."""
        values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
        values.flags.writeable = writable
        keys = ht.mode(values, False)[0]
        tm.assert_numpy_array_equal(keys, values)
|
||||
|
||||
|
||||
def test_modes_with_nans():
|
||||
# GH42688, nans aren't mangled
|
||||
nulls = [pd.NA, np.nan, pd.NaT, None]
|
||||
values = np.array([True] + nulls * 2, dtype=np.object_)
|
||||
modes = ht.mode(values, False)[0]
|
||||
assert modes.size == len(nulls)
|
||||
|
||||
|
||||
def test_unique_label_indices_intp(writable):
    """``unique_label_indices`` returns the first index of each distinct label.

    ``writable`` comes from a fixture not defined in this file; it is
    assigned to the input array's ``flags.writeable``.
    """
    keys = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp)
    keys.flags.writeable = writable
    result = ht.unique_label_indices(keys)
    expected = np.array([0, 1, 5], dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_unique_label_indices():
    """Compare ``unique_label_indices`` with ``np.unique(..., return_index=True)``.

    In the second half, ten positions are set to -1; the expected result
    drops the first ``np.unique`` entry, which corresponds to that -1 label.
    """
    a = np.random.default_rng(2).integers(1, 1 << 10, 1 << 15).astype(np.intp)

    left = ht.unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]

    tm.assert_numpy_array_equal(left, right, check_dtype=False)

    a[np.random.default_rng(2).choice(len(a), 10)] = -1
    left = ht.unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_numpy_array_equal(left, right, check_dtype=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype",
    [
        np.float64,
        np.float32,
        np.complex128,
        np.complex64,
    ],
)
class TestHelpFunctionsWithNans:
    """NaN handling of the module-level ``ht`` helpers for float/complex dtypes."""

    def test_value_count(self, dtype):
        """NaNs are dropped with ``True`` and counted as one key with ``False``."""
        values = np.array([np.nan, np.nan, np.nan], dtype=dtype)
        keys, counts, _ = ht.value_count(values, True)
        assert len(keys) == 0
        keys, counts, _ = ht.value_count(values, False)
        assert len(keys) == 1 and np.all(np.isnan(keys))
        assert counts[0] == 3

    def test_duplicated_first(self, dtype):
        """Repeated NaNs after the first are flagged as duplicates."""
        values = np.array([np.nan, np.nan, np.nan], dtype=dtype)
        result = ht.duplicated(values)
        expected = np.array([False, True, True])
        tm.assert_numpy_array_equal(result, expected)

    def test_ismember_yes(self, dtype):
        """NaN is a member of an array of NaNs."""
        arr = np.array([np.nan, np.nan, np.nan], dtype=dtype)
        values = np.array([np.nan, np.nan], dtype=dtype)
        result = ht.ismember(arr, values)
        expected = np.array([True, True, True], dtype=np.bool_)
        tm.assert_numpy_array_equal(result, expected)

    def test_ismember_no(self, dtype):
        """NaN is not a member of an array without NaN."""
        arr = np.array([np.nan, np.nan, np.nan], dtype=dtype)
        values = np.array([1], dtype=dtype)
        result = ht.ismember(arr, values)
        expected = np.array([False, False, False], dtype=np.bool_)
        tm.assert_numpy_array_equal(result, expected)

    def test_mode(self, dtype):
        """Mode is 42 when the flag is ``True`` and NaN when it is ``False``."""
        values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype)
        assert ht.mode(values, True)[0] == 42
        assert np.isnan(ht.mode(values, False)[0])
|
||||
|
||||
|
||||
def test_ismember_tuple_with_nans():
|
||||
# GH-41836
|
||||
values = [("a", float("nan")), ("b", 1)]
|
||||
comps = [("a", float("nan"))]
|
||||
|
||||
msg = "isin with argument that is not not a Series"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = isin(values, comps)
|
||||
expected = np.array([True, False], dtype=np.bool_)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_float_complex_int_are_equal_as_objects():
    """In an object array, ``5``, ``5.0`` and ``5.0 + 0j`` all match integer 5."""
    values = ["a", 5, 5.0, 5.0 + 0j]
    comps = list(range(129))
    result = isin(np.array(values, dtype=object), np.asarray(comps))
    expected = np.array([False, True, True, True], dtype=np.bool_)
    tm.assert_numpy_array_equal(result, expected)
|
||||
390
venv/lib/python3.13/site-packages/pandas/tests/libs/test_join.py
Normal file
390
venv/lib/python3.13/site-packages/pandas/tests/libs/test_join.py
Normal file
|
|
@ -0,0 +1,390 @@
|
|||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import join as libjoin
|
||||
from pandas._libs.join import (
|
||||
inner_join,
|
||||
left_outer_join,
|
||||
)
|
||||
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestIndexer:
    """Tests of the indexer/join routines in ``pandas._libs.join``."""

    @pytest.mark.parametrize(
        "dtype", ["int32", "int64", "float32", "float64", "object"]
    )
    def test_outer_join_indexer(self, dtype):
        """Outer join of overlapping, empty-left and empty-right inputs."""
        indexer = libjoin.outer_join_indexer

        left = np.arange(3, dtype=dtype)
        right = np.arange(2, 5, dtype=dtype)
        empty = np.array([], dtype=dtype)

        result, lindexer, rindexer = indexer(left, right)
        assert isinstance(result, np.ndarray)
        assert isinstance(lindexer, np.ndarray)
        assert isinstance(rindexer, np.ndarray)
        tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype))
        exp = np.array([0, 1, 2, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(lindexer, exp)
        exp = np.array([-1, -1, 0, 1, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(rindexer, exp)

        result, lindexer, rindexer = indexer(empty, right)
        tm.assert_numpy_array_equal(result, right)
        exp = np.array([-1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(lindexer, exp)
        exp = np.array([0, 1, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(rindexer, exp)

        result, lindexer, rindexer = indexer(left, empty)
        tm.assert_numpy_array_equal(result, left)
        exp = np.array([0, 1, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(lindexer, exp)
        exp = np.array([-1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(rindexer, exp)

    def test_cython_left_outer_join(self):
        """``left_outer_join`` against expectations built from stable argsorts."""
        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
        right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
        max_group = 5

        ls, rs = left_outer_join(left, right, max_group)

        exp_ls = left.argsort(kind="mergesort")
        exp_rs = right.argsort(kind="mergesort")

        exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = np.array(
            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]
        )

        # translate positions in the sorted arrays back to original positions;
        # -1 marks "no match" and must survive the take()
        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        """Right outer join, obtained by calling ``left_outer_join`` with swapped args."""
        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
        right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
        max_group = 5

        rs, ls = left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind="mergesort")
        exp_rs = right.argsort(kind="mergesort")

        #            0        1        1        1
        exp_li = np.array(
            [
                0,
                1,
                2,
                3,
                4,
                5,
                3,
                4,
                5,
                3,
                4,
                5,
                #            2        2        4
                6,
                7,
                8,
                6,
                7,
                8,
                -1,
            ]
        )
        exp_ri = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls)
        tm.assert_numpy_array_equal(rs, exp_rs)

    def test_cython_inner_join(self):
        """``inner_join`` against expectations built from stable argsorts."""
        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
        right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.intp)
        max_group = 5

        ls, rs = inner_join(left, right, max_group)

        exp_ls = left.argsort(kind="mergesort")
        exp_rs = right.argsort(kind="mergesort")

        exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = np.array([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls)
        tm.assert_numpy_array_equal(rs, exp_rs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("readonly", [True, False])
def test_left_join_indexer_unique(readonly):
    """``left_join_indexer_unique`` works on writable and read-only inputs."""
    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    b = np.array([2, 2, 3, 4, 4], dtype=np.int64)
    if readonly:
        # GH#37312, GH#37264
        a.setflags(write=False)
        b.setflags(write=False)

    result = libjoin.left_join_indexer_unique(b, a)
    expected = np.array([1, 1, 2, 3, 3], dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_left_outer_join_bug():
    """Unsorted ``left_outer_join`` returns left positions in order with matching right positions."""
    # fmt: off
    left = np.array(
        [
            0, 1, 0, 1, 1, 2, 3, 1, 0, 2,
            1, 2, 0, 1, 1, 2, 3, 2, 3, 2,
            1, 1, 3, 0, 3, 2, 3, 0, 0, 2,
            3, 2, 0, 3, 1, 3, 0, 1, 3, 0,
            0, 1, 0, 3, 1, 0, 1, 0, 1, 1,
            0, 2, 2, 2, 2, 2, 0, 3, 1, 2,
            0, 0, 3, 1, 3, 2, 2, 0, 1, 3,
            0, 2, 3, 2, 3, 3, 2, 3, 3, 1,
            3, 2, 0, 0, 3, 1, 1, 1, 0, 2,
            3, 3, 1, 2, 0, 3, 1, 2, 0, 2,
        ],
        dtype=np.intp,
    )
    # fmt: on
    right = np.array([3, 1], dtype=np.intp)

    lidx, ridx = libjoin.left_outer_join(left, right, 4, sort=False)

    n_left = len(left)
    # Key 3 is at right position 0 and key 1 at position 1; all other
    # left keys have no partner and are expected to map to -1.
    exp_ridx = np.full(n_left, -1, dtype=np.intp)
    exp_ridx[left == 1] = 1
    exp_ridx[left == 3] = 0

    tm.assert_numpy_array_equal(lidx, np.arange(n_left, dtype=np.intp))
    tm.assert_numpy_array_equal(ridx, exp_ridx)
|
||||
|
||||
|
||||
def test_inner_join_indexer():
    """``inner_join_indexer`` returns the shared values and their positions in each input."""
    left = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    right = np.array([0, 3, 5, 7, 9], dtype=np.int64)

    joined, left_pos, right_pos = libjoin.inner_join_indexer(left, right)

    tm.assert_almost_equal(joined, np.array([3, 5], dtype=np.int64))
    tm.assert_almost_equal(left_pos, np.array([2, 4], dtype=np.intp))
    tm.assert_almost_equal(right_pos, np.array([1, 2], dtype=np.intp))

    # single-element inputs holding the same value
    left = np.array([5], dtype=np.int64)
    right = np.array([5], dtype=np.int64)

    joined, left_pos, right_pos = libjoin.inner_join_indexer(left, right)

    zero_pos = np.array([0], dtype=np.intp)
    tm.assert_numpy_array_equal(joined, np.array([5], dtype=np.int64))
    tm.assert_numpy_array_equal(left_pos, zero_pos)
    tm.assert_numpy_array_equal(right_pos, zero_pos)
|
||||
|
||||
|
||||
def test_outer_join_indexer():
    """``outer_join_indexer`` returns the union of values, with -1 where an input lacks a value."""
    left = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    right = np.array([0, 3, 5, 7, 9], dtype=np.int64)

    joined, left_pos, right_pos = libjoin.outer_join_indexer(left, right)

    tm.assert_almost_equal(joined, np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64))
    tm.assert_almost_equal(
        left_pos, np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.intp)
    )
    tm.assert_almost_equal(
        right_pos, np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp)
    )

    # single-element inputs holding the same value
    left = np.array([5], dtype=np.int64)
    right = np.array([5], dtype=np.int64)

    joined, left_pos, right_pos = libjoin.outer_join_indexer(left, right)

    zero_pos = np.array([0], dtype=np.intp)
    tm.assert_numpy_array_equal(joined, np.array([5], dtype=np.int64))
    tm.assert_numpy_array_equal(left_pos, zero_pos)
    tm.assert_numpy_array_equal(right_pos, zero_pos)
|
||||
|
||||
|
||||
def test_left_join_indexer():
    """``left_join_indexer`` keeps every left value and marks missing right matches with -1."""
    left = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    right = np.array([0, 3, 5, 7, 9], dtype=np.int64)

    joined, left_pos, right_pos = libjoin.left_join_indexer(left, right)

    tm.assert_almost_equal(joined, left)
    tm.assert_almost_equal(left_pos, np.array([0, 1, 2, 3, 4], dtype=np.intp))
    tm.assert_almost_equal(right_pos, np.array([-1, -1, 1, -1, 2], dtype=np.intp))

    # single-element inputs holding the same value
    left = np.array([5], dtype=np.int64)
    right = np.array([5], dtype=np.int64)

    joined, left_pos, right_pos = libjoin.left_join_indexer(left, right)

    zero_pos = np.array([0], dtype=np.intp)
    tm.assert_numpy_array_equal(joined, np.array([5], dtype=np.int64))
    tm.assert_numpy_array_equal(left_pos, zero_pos)
    tm.assert_numpy_array_equal(right_pos, zero_pos)
|
||||
|
||||
|
||||
def test_left_join_indexer2():
    """``left_join_indexer`` with a duplicated value on the right-hand input."""
    with_dups = np.array([1, 1, 2, 5], dtype=np.int64)
    unique_vals = np.array([1, 2, 5, 7, 9], dtype=np.int64)

    res, lidx, ridx = libjoin.left_join_indexer(unique_vals, with_dups)

    expected = (
        (res, np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)),
        (lidx, np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)),
        (ridx, np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)),
    )
    for actual, wanted in expected:
        tm.assert_almost_equal(actual, wanted)
|
||||
|
||||
|
||||
def test_outer_join_indexer2():
    """``outer_join_indexer`` with a duplicated value on the right-hand input."""
    with_dups = np.array([1, 1, 2, 5], dtype=np.int64)
    unique_vals = np.array([1, 2, 5, 7, 9], dtype=np.int64)

    res, lidx, ridx = libjoin.outer_join_indexer(unique_vals, with_dups)

    expected = (
        (res, np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)),
        (lidx, np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)),
        (ridx, np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)),
    )
    for actual, wanted in expected:
        tm.assert_almost_equal(actual, wanted)
|
||||
|
||||
|
||||
def test_inner_join_indexer2():
    """``inner_join_indexer`` with a duplicated value on the right-hand input."""
    with_dups = np.array([1, 1, 2, 5], dtype=np.int64)
    unique_vals = np.array([1, 2, 5, 7, 9], dtype=np.int64)

    res, lidx, ridx = libjoin.inner_join_indexer(unique_vals, with_dups)

    expected = (
        (res, np.array([1, 1, 2, 5], dtype=np.int64)),
        (lidx, np.array([0, 0, 1, 2], dtype=np.intp)),
        (ridx, np.array([0, 1, 2, 3], dtype=np.intp)),
    )
    for actual, wanted in expected:
        tm.assert_almost_equal(actual, wanted)
|
||||
299
venv/lib/python3.13/site-packages/pandas/tests/libs/test_lib.py
Normal file
299
venv/lib/python3.13/site-packages/pandas/tests/libs/test_lib.py
Normal file
|
|
@ -0,0 +1,299 @@
|
|||
import pickle
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import (
|
||||
Timedelta,
|
||||
lib,
|
||||
writers as libwriters,
|
||||
)
|
||||
from pandas.compat import IS64
|
||||
|
||||
from pandas import Index
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestMisc:
    """Tests for assorted helpers in ``pandas._libs.lib`` and ``pandas._libs.writers``."""

    def test_max_len_string_array(self):
        """Longest-element length is 3 for object arrays; a non-object array raises."""
        base = np.array(["foo", "b", np.nan], dtype="object")
        assert libwriters.max_len_string_array(base) == 3

        # unicode
        from_unicode = base.astype("U").astype(object)
        assert libwriters.max_len_string_array(from_unicode) == 3

        # bytes for python3
        from_bytes = base.astype("S").astype(object)
        assert libwriters.max_len_string_array(from_bytes) == 3

        # raises
        with pytest.raises(TypeError, match="No matching signature found"):
            libwriters.max_len_string_array(from_bytes.astype("U"))

    def test_fast_unique_multiple_list_gen_sort(self):
        """Unique values from a generator of lists, with and without sorting."""
        keys = [["p", "a"], ["n", "d"], ["a", "s"]]
        cases = (
            (True, ["a", "d", "n", "p", "s"]),
            (False, ["p", "a", "n", "d", "s"]),
        )

        for sort, wanted in cases:
            # a fresh generator per call, since the previous one is consumed
            gen = (key for key in keys)
            out = lib.fast_unique_multiple_list_gen(gen, sort=sort)
            tm.assert_numpy_array_equal(np.array(out), np.array(wanted))

    def test_fast_multiget_timedelta_resos(self):
        """``fast_multiget`` gives the same result for Timedelta keys of different units."""
        # This will become relevant for test_constructor_dict_timedelta64_index
        #  once Timedelta constructor preserves reso when passed a
        #  np.timedelta64 object
        day = Timedelta(days=1)

        default_unit_map = {day: 1}
        seconds_map = {day.as_unit("s"): 1}

        lookup = Index([day * n for n in range(3)])._values.astype(object)

        expected = lib.fast_multiget(default_unit_map, lookup)
        result = lib.fast_multiget(seconds_map, lookup)
        tm.assert_numpy_array_equal(result, expected)

        # case that can't be cast to td64ns
        big = Timedelta(np.timedelta64(146000, "D"))
        for unit in ("ms", "us"):
            assert hash(big) == hash(big.as_unit(unit))
        default_unit_map = {big: 1}
        millis_map = {big.as_unit("ms"): 1}

        lookup = Index([big * n for n in range(3)])._values.astype(object)

        expected = lib.fast_multiget(default_unit_map, lookup)
        result = lib.fast_multiget(millis_map, lookup)
        tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
class TestIndexing:
|
||||
def test_maybe_indices_to_slice_left_edge(self):
|
||||
target = np.arange(100)
|
||||
|
||||
# slice
|
||||
indices = np.array([], dtype=np.intp)
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
@pytest.mark.parametrize("end", [1, 2, 5, 20, 99])
|
||||
@pytest.mark.parametrize("step", [1, 2, 4])
|
||||
def test_maybe_indices_to_slice_left_edge_not_slice_end_steps(self, end, step):
|
||||
target = np.arange(100)
|
||||
indices = np.arange(0, end, step, dtype=np.intp)
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
# reverse
|
||||
indices = indices[::-1]
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"case", [[2, 1, 2, 0], [2, 2, 1, 0], [0, 1, 2, 1], [-2, 0, 2], [2, 0, -2]]
|
||||
)
|
||||
def test_maybe_indices_to_slice_left_edge_not_slice(self, case):
|
||||
# not slice
|
||||
target = np.arange(100)
|
||||
indices = np.array(case, dtype=np.intp)
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert not isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(maybe_slice, indices)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
@pytest.mark.parametrize("start", [0, 2, 5, 20, 97, 98])
|
||||
@pytest.mark.parametrize("step", [1, 2, 4])
|
||||
def test_maybe_indices_to_slice_right_edge(self, start, step):
|
||||
target = np.arange(100)
|
||||
|
||||
# slice
|
||||
indices = np.arange(start, 99, step, dtype=np.intp)
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
# reverse
|
||||
indices = indices[::-1]
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
def test_maybe_indices_to_slice_right_edge_not_slice(self):
|
||||
# not slice
|
||||
target = np.arange(100)
|
||||
indices = np.array([97, 98, 99, 100], dtype=np.intp)
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert not isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(maybe_slice, indices)
|
||||
|
||||
msg = "index 100 is out of bounds for axis (0|1) with size 100"
|
||||
|
||||
with pytest.raises(IndexError, match=msg):
|
||||
target[indices]
|
||||
with pytest.raises(IndexError, match=msg):
|
||||
target[maybe_slice]
|
||||
|
||||
indices = np.array([100, 99, 98, 97], dtype=np.intp)
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert not isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(maybe_slice, indices)
|
||||
|
||||
with pytest.raises(IndexError, match=msg):
|
||||
target[indices]
|
||||
with pytest.raises(IndexError, match=msg):
|
||||
target[maybe_slice]
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"case", [[99, 97, 99, 96], [99, 99, 98, 97], [98, 98, 97, 96]]
|
||||
)
|
||||
def test_maybe_indices_to_slice_right_edge_cases(self, case):
|
||||
target = np.arange(100)
|
||||
indices = np.array(case, dtype=np.intp)
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert not isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(maybe_slice, indices)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
@pytest.mark.parametrize("step", [1, 2, 4, 5, 8, 9])
|
||||
def test_maybe_indices_to_slice_both_edges(self, step):
|
||||
target = np.arange(10)
|
||||
|
||||
# slice
|
||||
indices = np.arange(0, 9, step, dtype=np.intp)
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
assert isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
# reverse
|
||||
indices = indices[::-1]
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
assert isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
@pytest.mark.parametrize("case", [[4, 2, 0, -2], [2, 2, 1, 0], [0, 1, 2, 1]])
|
||||
def test_maybe_indices_to_slice_both_edges_not_slice(self, case):
|
||||
# not slice
|
||||
target = np.arange(10)
|
||||
indices = np.array(case, dtype=np.intp)
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
assert not isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(maybe_slice, indices)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
@pytest.mark.parametrize("start, end", [(2, 10), (5, 25), (65, 97)])
|
||||
@pytest.mark.parametrize("step", [1, 2, 4, 20])
|
||||
def test_maybe_indices_to_slice_middle(self, start, end, step):
|
||||
target = np.arange(100)
|
||||
|
||||
# slice
|
||||
indices = np.arange(start, end, step, dtype=np.intp)
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
# reverse
|
||||
indices = indices[::-1]
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"case", [[14, 12, 10, 12], [12, 12, 11, 10], [10, 11, 12, 11]]
|
||||
)
|
||||
def test_maybe_indices_to_slice_middle_not_slice(self, case):
|
||||
# not slice
|
||||
target = np.arange(100)
|
||||
indices = np.array(case, dtype=np.intp)
|
||||
maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
|
||||
|
||||
assert not isinstance(maybe_slice, slice)
|
||||
tm.assert_numpy_array_equal(maybe_slice, indices)
|
||||
tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
|
||||
|
||||
def test_maybe_booleans_to_slice(self):
|
||||
arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8)
|
||||
result = lib.maybe_booleans_to_slice(arr)
|
||||
assert result.dtype == np.bool_
|
||||
|
||||
result = lib.maybe_booleans_to_slice(arr[:0])
|
||||
assert result == slice(0, 0)
|
||||
|
||||
def test_get_reverse_indexer(self):
|
||||
indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.intp)
|
||||
result = lib.get_reverse_indexer(indexer, 5)
|
||||
expected = np.array([4, 2, 3, 6, 7], dtype=np.intp)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["int64", "int32"])
|
||||
def test_is_range_indexer(self, dtype):
|
||||
# GH#50592
|
||||
left = np.arange(0, 100, dtype=dtype)
|
||||
assert lib.is_range_indexer(left, 100)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not IS64,
|
||||
reason="2**31 is too big for Py_ssize_t on 32-bit. "
|
||||
"It doesn't matter though since you cannot create an array that long on 32-bit",
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["int64", "int32"])
|
||||
def test_is_range_indexer_big_n(self, dtype):
|
||||
# GH53616
|
||||
left = np.arange(0, 100, dtype=dtype)
|
||||
|
||||
assert not lib.is_range_indexer(left, 2**31)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["int64", "int32"])
|
||||
def test_is_range_indexer_not_equal(self, dtype):
|
||||
# GH#50592
|
||||
left = np.array([1, 2], dtype=dtype)
|
||||
assert not lib.is_range_indexer(left, 2)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["int64", "int32"])
|
||||
def test_is_range_indexer_not_equal_shape(self, dtype):
|
||||
# GH#50592
|
||||
left = np.array([0, 1, 2], dtype=dtype)
|
||||
assert not lib.is_range_indexer(left, 2)
|
||||
|
||||
|
||||
def test_cache_readonly_preserve_docstrings():
    """``Index.hasnans`` still exposes a docstring when accessed on the class."""
    # GH18197
    doc = Index.hasnans.__doc__
    assert doc is not None
|
||||
|
||||
|
||||
def test_no_default_pickle():
    """``lib.no_default`` round-trips through pickle to the very same object."""
    # GH#40397
    assert tm.round_trip_pickle(lib.no_default) is lib.no_default
|
||||
|
||||
|
||||
def test_ensure_string_array_copy():
    """``ensure_string_array(copy=False)`` leaves a pickle-roundtripped input untouched."""
    # ensure the original array is not modified in case of copy=False with
    # pickle-roundtripped object dtype array
    # https://github.com/pandas-dev/pandas/issues/54654
    original = pickle.loads(pickle.dumps(np.array(["a", None], dtype=object)))

    converted = lib.ensure_string_array(original, copy=False)

    assert not np.shares_memory(original, converted)
    assert original[1] is None
    assert converted[1] is np.nan
|
||||
|
|
@ -0,0 +1,162 @@
|
|||
from datetime import datetime
|
||||
from itertools import permutations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import algos as libalgos
|
||||
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_ensure_platform_int():
    """An ``intp`` array is returned as the same object, without copying."""
    platform_ints = np.arange(100, dtype=np.intp)

    assert libalgos.ensure_platform_int(platform_ints) is platform_ints
|
||||
|
||||
|
||||
def test_is_lexsorted():
    """``is_lexsorted`` reports False for two descending int64 levels."""
    # Both levels must have the same number of entries: four runs of 32
    # values each, i.e. 128 elements per level.
    failure = [
        np.array(
            ([3] * 32) + ([2] * 32) + ([1] * 32) + ([0] * 32),
            dtype="int64",
        ),
        np.array(
            # range(32) gives 31..0, which matches the 32-long runs of the
            # first level (range(31) would yield only 124 elements).
            list(range(32))[::-1] * 4,
            dtype="int64",
        ),
    ]
    assert len(failure[0]) == len(failure[1])

    assert not libalgos.is_lexsorted(failure)
|
||||
|
||||
|
||||
def test_groupsort_indexer():
    """``groupsort_indexer`` matches a stable argsort and a two-key lexsort."""
    # Draw both arrays from ONE generator: two generators created with the
    # same seed would yield identical arrays, and the lexsort comparison
    # below would then never exercise a distinct secondary key.
    rng = np.random.default_rng(2)
    a = rng.integers(0, 1000, 100).astype(np.intp)
    b = rng.integers(0, 1000, 100).astype(np.intp)

    result = libalgos.groupsort_indexer(a, 1000)[0]

    # need to use a stable sort
    # np.argsort returns int, groupsort_indexer
    # always returns intp
    expected = np.argsort(a, kind="mergesort")
    expected = expected.astype(np.intp)

    tm.assert_numpy_array_equal(result, expected)

    # compare with lexsort
    # np.lexsort returns int, groupsort_indexer
    # always returns intp
    # a and b are both below 1000, so a * 1000 + b orders by a, then by b.
    key = a * 1000 + b
    result = libalgos.groupsort_indexer(key, 1000000)[0]
    expected = np.lexsort((b, a))
    expected = expected.astype(np.intp)

    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
class TestPadBackfill:
    """Tests for the ``pad`` and ``backfill`` indexer builders in ``libalgos``."""

    def test_backfill(self):
        """Check ``backfill`` positions, including targets past the last old value."""
        backfill = libalgos.backfill["int64_t"]

        old = np.array([1, 5, 10], dtype=np.int64)
        new = np.arange(12, dtype=np.int64)

        expect_filler = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(backfill(old, new), expect_filler)

        # corner case
        old = np.array([1, 4], dtype=np.int64)
        new = np.arange(5, 10, dtype=np.int64)

        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(backfill(old, new), expect_filler)

    def test_pad(self):
        """Check ``pad`` positions, including targets before the first old value."""
        pad = libalgos.pad["int64_t"]

        old = np.array([1, 5, 10], dtype=np.int64)
        new = np.arange(12, dtype=np.int64)

        expect_filler = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(pad(old, new), expect_filler)

        # corner case
        old = np.array([5, 10], dtype=np.int64)
        new = np.arange(5, dtype=np.int64)

        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(pad(old, new), expect_filler)

    def test_pad_backfill_object_segfault(self):
        """Object-dtype ``pad``/``backfill`` cope with an empty array on either side."""
        empty = np.array([], dtype="O")
        single = np.array([datetime(2010, 12, 31)], dtype="O")

        for fill in (libalgos.pad["object"], libalgos.backfill["object"]):
            # empty source: the single target has nothing to fill from
            tm.assert_numpy_array_equal(
                fill(empty, single), np.array([-1], dtype=np.intp)
            )
            # empty target: the indexer itself is empty
            tm.assert_numpy_array_equal(
                fill(single, empty), np.array([], dtype=np.intp)
            )
|
||||
|
||||
|
||||
class TestInfinity:
    """Tests for the ``Infinity`` / ``NegInfinity`` sentinels in ``libalgos``."""

    def test_infinity_sort(self):
        """The sentinels sort outside the float infinities and compare consistently."""
        # GH#13445
        # numpy's argsort can be unhappy if something is less than
        # itself.  Instead, let's give our infinities a self-consistent
        # ordering, but outside the float extended real line.

        pos = libalgos.Infinity()
        neg = libalgos.NegInfinity()

        ordered = [neg, float("-inf"), -1e100, 0, 1e100, float("inf"), pos]

        assert all(pos >= item for item in ordered)
        assert all(pos > item or item is pos for item in ordered)
        assert pos >= pos and pos == pos
        assert not pos < pos and not pos > pos
        assert libalgos.Infinity() == libalgos.Infinity()
        assert not libalgos.Infinity() != libalgos.Infinity()

        assert all(neg <= item for item in ordered)
        assert all(neg < item or item is neg for item in ordered)
        assert neg <= neg and neg == neg
        assert not neg < neg and not neg > neg
        assert libalgos.NegInfinity() == libalgos.NegInfinity()
        assert not libalgos.NegInfinity() != libalgos.NegInfinity()

        # every ordering of the values must sort back to the reference order
        for shuffled in permutations(ordered):
            assert sorted(shuffled) == ordered

        # smoke tests
        for sentinel_type in (libalgos.Infinity, libalgos.NegInfinity):
            np.array([sentinel_type()] * 32).argsort()

    def test_infinity_against_nan(self):
        """Ordering and equality against NaN are False; only ``!=`` is True."""
        for sentinel in (libalgos.Infinity(), libalgos.NegInfinity()):
            assert not sentinel > np.nan
            assert not sentinel >= np.nan
            assert not sentinel < np.nan
            assert not sentinel <= np.nan
            assert not sentinel == np.nan
            assert sentinel != np.nan
|
||||
Loading…
Add table
Add a link
Reference in a new issue