2
2
import re
3
3
from typing import (
4
4
Any ,
5
+ Dict ,
6
+ List ,
5
7
Optional ,
6
8
Tuple ,
7
9
Union ,
22
24
Endianness ,
23
25
)
24
26
25
- _NP_DTYPES = {
27
+ _NP_DTYPES : Dict [ DtypeKind , Dict [ int , Any ]] = {
26
28
DtypeKind .INT : {8 : np .int8 , 16 : np .int16 , 32 : np .int32 , 64 : np .int64 },
27
29
DtypeKind .UINT : {8 : np .uint8 , 16 : np .uint16 , 32 : np .uint32 , 64 : np .uint64 },
28
30
DtypeKind .FLOAT : {32 : np .float32 , 64 : np .float64 },
@@ -90,7 +92,7 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
90
92
"""
91
93
# We need a dict of columns here, with each column being a NumPy array (at
92
94
# least for now, deal with non-NumPy dtypes later).
93
- columns = {}
95
+ columns : Dict [ str , Any ] = {}
94
96
buffers = [] # hold on to buffers, keeps memory alive
95
97
for name in df .column_names ():
96
98
if not isinstance (name , str ):
@@ -161,12 +163,14 @@ def categorical_column_to_series(col: Column) -> Tuple[pd.Series, Any]:
161
163
Tuple of pd.Series holding the data and the memory owner object
162
164
that keeps the memory alive.
163
165
"""
164
- ordered , is_dict , mapping = col .describe_categorical . values ()
166
+ categorical = col .describe_categorical
165
167
166
- if not is_dict :
168
+ if not categorical [ "is_dictionary" ] :
167
169
raise NotImplementedError ("Non-dictionary categoricals not supported yet" )
168
170
169
- categories = np .array (tuple (mapping .values ()))
171
+ mapping = categorical ["mapping" ]
172
+ assert isinstance (mapping , dict ), "Categorical mapping must be a dict"
173
+ categories = np .array (tuple (mapping [k ] for k in sorted (mapping )))
170
174
buffers = col .get_buffers ()
171
175
172
176
codes_buff , codes_dtype = buffers ["data" ]
@@ -176,7 +180,9 @@ def categorical_column_to_series(col: Column) -> Tuple[pd.Series, Any]:
176
180
# out-of-bounds sentinel values in `codes`
177
181
values = categories [codes % len (categories )]
178
182
179
- cat = pd .Categorical (values , categories = categories , ordered = ordered )
183
+ cat = pd .Categorical (
184
+ values , categories = categories , ordered = categorical ["is_ordered" ]
185
+ )
180
186
data = pd .Series (cat )
181
187
182
188
data = set_nulls (data , col , buffers ["validity" ])
@@ -210,6 +216,7 @@ def string_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]:
210
216
211
217
buffers = col .get_buffers ()
212
218
219
+ assert buffers ["offsets" ], "String buffers must contain offsets"
213
220
# Retrieve the data buffer containing the UTF-8 code units
214
221
data_buff , protocol_data_dtype = buffers ["data" ]
215
222
# We're going to reinterpret the buffer as uint8, so make sure we can do it safely
@@ -238,13 +245,14 @@ def string_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]:
238
245
239
246
null_pos = None
240
247
if null_kind in (ColumnNullType .USE_BITMASK , ColumnNullType .USE_BYTEMASK ):
248
+ assert buffers ["validity" ], "Validity buffers cannot be empty for masks"
241
249
valid_buff , valid_dtype = buffers ["validity" ]
242
250
null_pos = buffer_to_ndarray (valid_buff , valid_dtype , col .offset , col .size )
243
251
if sentinel_val == 0 :
244
252
null_pos = ~ null_pos
245
253
246
254
# Assemble the strings from the code units
247
- str_list = [None ] * col .size
255
+ str_list : List [ Union [ None , float , str ]] = [None ] * col .size
248
256
for i in range (col .size ):
249
257
# Check for missing values
250
258
if null_pos is not None and null_pos [i ]:
@@ -448,7 +456,7 @@ def bitmask_to_bool_ndarray(
448
456
def set_nulls (
449
457
data : Union [np .ndarray , pd .Series ],
450
458
col : Column ,
451
- validity : Tuple [Buffer , Tuple [DtypeKind , int , str , str ]],
459
+ validity : Optional [ Tuple [Buffer , Tuple [DtypeKind , int , str , str ] ]],
452
460
allow_modify_inplace : bool = True ,
453
461
):
454
462
"""
@@ -478,6 +486,7 @@ def set_nulls(
478
486
if null_kind == ColumnNullType .USE_SENTINEL :
479
487
null_pos = data == sentinel_val
480
488
elif null_kind in (ColumnNullType .USE_BITMASK , ColumnNullType .USE_BYTEMASK ):
489
+ assert validity , "Expected to have a validity buffer for the mask"
481
490
valid_buff , valid_dtype = validity
482
491
null_pos = buffer_to_ndarray (valid_buff , valid_dtype , col .offset , col .size )
483
492
if sentinel_val == 0 :
0 commit comments