Commit 3d26bd6

ENH: add to/from_parquet with pyarrow & fastparquet

1 parent ecaeea1

14 files changed: +470 −4 lines

ci/requirements-2.7.sh

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 27"
 
-conda install -n pandas -c conda-forge feather-format
+conda install -n pandas -c conda-forge feather-format pyarrow fastparquet

ci/requirements-3.5.sh

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 35"
 
-conda install -n pandas -c conda-forge feather-format
+conda install -n pandas -c conda-forge feather-format pyarrow

ci/requirements-3.5_DOC.sh

Lines changed: 1 addition & 1 deletion
@@ -6,6 +6,6 @@ echo "[install DOC_BUILD deps]"
 
 pip install pandas-gbq
 
-conda install -n pandas -c conda-forge feather-format
+conda install -n pandas -c conda-forge feather-format pyarrow fastparquet
 
 conda install -n pandas -c r r rpy2 --yes

ci/requirements-3.5_OSX.sh

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 35_OSX"
 
-conda install -n pandas -c conda-forge feather-format
+conda install -n pandas -c conda-forge feather-format fastparquet

ci/requirements-3.6.pip

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+brotlipy

ci/requirements-3.6.run

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@ jinja2
 sqlalchemy
 pymysql
 feather-format
+pyarrow
+python-snappy
+fastparquet
 # psycopg2 (not avail on defaults ATM)
 beautifulsoup4
 s3fs

ci/requirements-3.6_WIN.run

Lines changed: 3 additions & 0 deletions
@@ -11,3 +11,6 @@ numexpr
 pytables
 matplotlib
 blosc
+fastparquet
+# not supported currently
+# pyarrow

doc/source/install.rst

Lines changed: 1 addition & 0 deletions
@@ -236,6 +236,7 @@ Optional Dependencies
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
 * `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
+* `Parquet Format <https://parquet.apache.org/>`__: either `pyarrow <https://pyarrow.readthedocs.io/en/latest/>`__ or `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__ is necessary for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 
 * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL

doc/source/io.rst

Lines changed: 64 additions & 0 deletions
@@ -43,6 +43,7 @@ object. The corresponding ``writer`` functions are object methods that are acces
 binary;`MS Excel <https://en.wikipedia.org/wiki/Microsoft_Excel>`__;:ref:`read_excel<io.excel_reader>`;:ref:`to_excel<io.excel_writer>`
 binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
 binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
+binary;`Parquet Format <https://parquet.apache.org/>`__;:ref:`read_parquet<io.parquet>`;:ref:`to_parquet<io.parquet>`
 binary;`Msgpack <http://msgpack.org/index.html>`__;:ref:`read_msgpack<io.msgpack>`;:ref:`to_msgpack<io.msgpack>`
 binary;`Stata <https://en.wikipedia.org/wiki/Stata>`__;:ref:`read_stata<io.stata_reader>`;:ref:`to_stata<io.stata_writer>`
 binary;`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__;:ref:`read_sas<io.sas_reader>`;
@@ -4505,6 +4506,69 @@ Read from a feather file.
    import os
    os.remove('example.feather')
 
+
+.. _io.parquet:
+
+Parquet
+-------
+
+.. versionadded:: 0.20.0
+
+Parquet provides a sharded binary columnar serialization for data frames. It is designed to make reading and writing
+data frames efficient, and to make sharing data across data analysis languages easy.
+
+Parquet is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas
+dtypes, including extension dtypes such as categorical and datetime with tz.
+
+Several caveats:
+
+- The format will NOT write an ``Index`` or ``MultiIndex`` for the ``DataFrame``, and will raise an
+  error if a non-default one is provided. You can simply ``.reset_index()`` in order to store the index.
+- Duplicate column names and non-string column names are not supported.
+- Unsupported types include ``Period`` and actual Python object types; these will raise a helpful error
+  message on an attempt at serialization.
+
+See the documentation for `pyarrow <https://pyarrow.readthedocs.io/en/latest/>`__ and `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a': list('abc'),
+                      'b': list(range(1, 4)),
+                      'c': np.arange(3, 6).astype('u1'),
+                      'd': np.arange(4.0, 7.0, dtype='float64'),
+                      'e': [True, False, True],
+                      'f': pd.Categorical(list('abc')),
+                      'g': pd.date_range('20130101', periods=3),
+                      'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
+                      'i': pd.date_range('20130101', periods=3, freq='ns')})
+
+   df
+   df.dtypes
+
+Write to a parquet file.
+
+.. ipython:: python
+
+   df.to_parquet('example_pa.pq', engine='pyarrow')
+   df.to_parquet('example_fp.pq', engine='fastparquet')
+
+Read from a parquet file.
+
+.. ipython:: python
+
+   result = pd.read_parquet('example_pa.pq', engine='pyarrow')
+   result = pd.read_parquet('example_fp.pq', engine='fastparquet')
+
+   # we preserve dtypes
+   result.dtypes
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove('example_pa.pq')
+   os.remove('example_fp.pq')
+
 .. _io.sql:
 
 SQL Queries

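The caveat list in the new Parquet section above means a DataFrame with a non-default index must be reset before writing. A minimal sketch of the documented ``.reset_index()`` workaround (hypothetical file name; assumes pyarrow is installed):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3]}, index=list('xyz'))

    # writing df directly would raise ValueError, since this commit
    # serializes only a default integer index; move it into a column first
    df.reset_index().to_parquet('indexed.pq', 'pyarrow')
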
doc/source/whatsnew/v0.20.0.txt

Lines changed: 1 addition & 0 deletions
@@ -274,6 +274,7 @@ Other Enhancements
 ^^^^^^^^^^^^^^^^^^
 
 - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here <io.feather>`.
+- Integration with the ``parquet`` format, including a new top-level ``pd.read_parquet()`` and ``DataFrame.to_parquet()`` method, see :ref:`here <io.parquet>`.
 - ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`)
 - ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`)
 
pandas/core/frame.py

Lines changed: 19 additions & 0 deletions
@@ -1520,6 +1520,25 @@ def to_feather(self, fname):
         from pandas.io.feather_format import to_feather
         to_feather(self, fname)
 
+    def to_parquet(self, fname, engine, compression=None):
+        """
+        Write a DataFrame to the binary parquet format.
+
+        .. versionadded:: 0.20.0
+
+        Parameters
+        ----------
+        fname : str
+            string file path
+        engine : str
+            parquet engine; supported engines are {'pyarrow', 'fastparquet'}
+        compression : str, optional
+            compression method, one of {'gzip', 'snappy', 'brotli'}
+
+        """
+        from pandas.io.parquet import to_parquet
+        to_parquet(self, fname, engine, compression=compression)
+
     @Substitution(header='Write out column names. If a list of string is given, \
 it is assumed to be aliases for the column names')
     @Appender(fmt.docstring_to_string, indents=1)
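
The new ``DataFrame.to_parquet`` method simply delegates to ``pandas.io.parquet.to_parquet``. A minimal usage sketch (hypothetical file name; assumes pyarrow and python-snappy are installed, per the ci requirement files above):

    import pandas as pd

    df = pd.DataFrame({'a': range(3), 'b': list('abc')})

    # engine is a required argument in this signature; compression is optional
    df.to_parquet('example.parquet', 'pyarrow', compression='snappy')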

pandas/io/parquet.py

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
+""" parquet compat """
+
+from warnings import catch_warnings
+from pandas import DataFrame, RangeIndex, Int64Index
+from pandas.compat import range
+
+
+def _try_import_pyarrow():
+    # since pandas is a dependency of pyarrow
+    # we need to import on first use
+
+    try:
+        import pyarrow
+    except ImportError:
+        raise ImportError("pyarrow is required for parquet support\n\n"
+                          "you can install via conda\n"
+                          "conda install pyarrow -c conda-forge\n"
+                          "\nor via pip\n"
+                          "pip install pyarrow\n")
+
+    return pyarrow
+
+
+def _try_import_fastparquet():
+    # since pandas is a dependency of fastparquet
+    # we need to import on first use
+
+    try:
+        import fastparquet
+    except ImportError:
+        raise ImportError("fastparquet is required for parquet support\n\n"
+                          "you can install via conda\n"
+                          "conda install fastparquet -c conda-forge\n"
+                          "\nor via pip\n"
+                          "pip install fastparquet")
+
+    return fastparquet
+
+
+def _validate_engine(engine):
+    if engine not in ['pyarrow', 'fastparquet']:
+        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
+
+
+def to_parquet(df, path, engine, compression=None):
+    """
+    Write a DataFrame to the parquet format.
+
+    Parameters
+    ----------
+    df : DataFrame
+    path : string
+        File path
+    engine : str
+        parquet engine; supported engines are {'pyarrow', 'fastparquet'}
+    compression : str, optional
+        compression method, one of {'gzip', 'snappy', 'brotli'}
+    """
+
+    _validate_engine(engine)
+
+    if not isinstance(df, DataFrame):
+        raise ValueError("to_parquet only supports IO with DataFrames")
+
+    valid_types = {'string', 'unicode'}
+
+    # validate index
+    # --------------
+
+    # validate that we have only a default index
+    # raise on anything else as we don't serialize the index
+
+    if not isinstance(df.index, Int64Index):
+        raise ValueError("parquet does not support serializing {} "
+                         "for the index; you can .reset_index() "
+                         "to make the index into column(s)".format(
+                             type(df.index)))
+
+    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
+        raise ValueError("parquet does not support serializing a "
+                         "non-default index for the index; you can "
+                         ".reset_index() to make the index into column(s)")
+
+    if df.index.name is not None:
+        raise ValueError("parquet does not serialize index meta-data on a "
+                         "default index")
+
+    # validate columns
+    # ----------------
+
+    # must have valid column names (strings only)
+    if df.columns.inferred_type not in valid_types:
+        raise ValueError("parquet must have string column names")
+
+    if engine == 'pyarrow':
+        pyarrow = _try_import_pyarrow()
+        from pyarrow import parquet as pq
+
+        table = pyarrow.Table.from_pandas(df)
+        pq.write_table(table, path, compression=compression)
+
+    elif engine == 'fastparquet':
+        fastparquet = _try_import_fastparquet()
+
+        # thriftpy/protocol/compact.py:339:
+        # DeprecationWarning: tostring() is deprecated.
+        # Use tobytes() instead.
+        with catch_warnings(record=True):
+            fastparquet.write(path, df, compression=compression)
+
+
+def read_parquet(path, engine):
+    """
+    Load a parquet object from the file path
+
+    .. versionadded:: 0.20.0
+
+    Parameters
+    ----------
+    path : string
+        File path
+    engine : str
+        parquet engine; supported engines are {'pyarrow', 'fastparquet'}
+
+    Returns
+    -------
+    type of object stored in file
+    """
+
+    _validate_engine(engine)
+
+    if engine == 'pyarrow':
+        pyarrow = _try_import_pyarrow()
+        return pyarrow.parquet.read_table(path).to_pandas()
+
+    elif engine == 'fastparquet':
+        fastparquet = _try_import_fastparquet()
+        pf = fastparquet.ParquetFile(path)
+        return pf.to_pandas()

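A round-trip sketch of the module-level API added here (hypothetical path; assumes fastparquet is installed):

    import pandas as pd
    from pandas.io.parquet import read_parquet, to_parquet

    df = pd.DataFrame({'a': range(3), 'b': list('abc')})

    to_parquet(df, 'roundtrip.parquet', 'fastparquet')
    result = read_parquet('roundtrip.parquet', 'fastparquet')

    # an unrecognized engine fails fast in _validate_engine
    try:
        read_parquet('roundtrip.parquet', 'csv')
    except ValueError as err:
        print(err)  # engine must be one of 'pyarrow', 'fastparquet'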