Skip to content

Commit ea10934

Browse files
committed
Fix gh-15086 properly instead of making a workaround
1 parent e8d951d commit ea10934

File tree

2 files changed

+33
-5
lines changed

2 files changed

+33
-5
lines changed

pandas/_libs/parsers.pyx

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -678,11 +678,7 @@ cdef class TextReader:
678678

679679
if isinstance(source, basestring):
680680
if not isinstance(source, bytes):
681-
if compat.PY36 and compat.is_platform_windows():
682-
# see gh-15086.
683-
encoding = "mbcs"
684-
else:
685-
encoding = sys.getfilesystemencoding() or "utf-8"
681+
encoding = sys.getfilesystemencoding() or "utf-8"
686682

687683
source = source.encode(encoding)
688684

pandas/_libs/src/parser/io.c

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ The full license is in the LICENSE file, distributed with this software.
1717
#define O_BINARY 0
1818
#endif // O_BINARY
1919

20+
#if PY_VERSION_HEX >= 0x03060000 && defined(_WIN32)
21+
#define USE_WIN_UTF16
22+
#include <Windows.h>
23+
#endif
24+
2025
/*
2126
On-disk FILE, uncompressed
2227
*/
@@ -27,7 +32,34 @@ void *new_file_source(char *fname, size_t buffer_size) {
2732
return NULL;
2833
}
2934

35+
#ifdef USE_WIN_UTF16
36+
// Fix gh-15086 properly - convert UTF8 to UTF16 that Windows widechar API
37+
// accepts. This is needed because UTF8 might _not_ be convertible to MBCS
38+
// for some conditions, as MBCS is locale-dependent, and not all unicode
39+
// symbols can be expressed in it.
40+
{
41+
wchar_t* wname = NULL;
42+
int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
43+
if (required == 0) {
44+
free(fs);
45+
return NULL;
46+
}
47+
wname = (wchar_t*)malloc(required * sizeof(wchar_t));
48+
if (wname == NULL) {
49+
free(fs);
50+
return NULL;
51+
}
52+
if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) < required) {
53+
free(wname);
54+
free(fs);
55+
return NULL;
56+
}
57+
fs->fd = _wopen(wname, O_RDONLY | O_BINARY);
58+
free(wname);
59+
}
60+
#else
3061
fs->fd = open(fname, O_RDONLY | O_BINARY);
62+
#endif
3163
if (fs->fd == -1) {
3264
free(fs);
3365
return NULL;

0 commit comments

Comments
 (0)