# Source code for enadepy.transform

# The MIT License (MIT)
#
# Copyright (c) 2020 M. Choji
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""A set of functions that transform a dataset in any way."""

import pandas
from pandas.api.types import CategoricalDtype
from typing import List, TypeVar

from .index import get_index_dict
from .loaders import _dtypes

# Type variables used for annotations in this module. The bounds refer to
# the public top-level pandas names instead of the internal
# ``pandas.core.frame`` module path; they resolve to the same classes.
PandasSeries = TypeVar('PandasSeries', bound=pandas.Series)
PandasDataFrame = TypeVar('PandasDataFrame', bound=pandas.DataFrame)


def _label_co_turno_graduacao(row: PandasSeries) -> int:
    """Collapses three binary period flags into one category code.

    The flags ``IN_MATUT``, ``IN_NOTURNO`` and ``IN_VESPER`` of a row are
    combined into a single integer code. This is used to adjust the Enade
    microdata from 2016.
    Note: this is a private helper, not meant to be called externally.

    Args:
        row (PandasSeries): A row from Enade microdata holding the three
            flag columns.

    Returns:
        int: The category code: 3 when the flags sum to more than one,
            1 when only ``IN_MATUT`` is set, 4 when only ``IN_NOTURNO``
            is set and 2 in every other case.
    """
    flag_matut = row['IN_MATUT']
    flag_noturno = row['IN_NOTURNO']
    flag_vesper = row['IN_VESPER']

    # More than one flag set takes precedence over any single flag.
    if flag_matut + flag_noturno + flag_vesper > 1:
        return 3
    if flag_matut == 1:
        return 1
    if flag_noturno == 1:
        return 4
    # Everything else (including no flag set) falls back to code 2.
    return 2


def align_microdata_2016(filepath: str, output: str) -> None:
    """Changes Enade microdata from 2016 to match newer versions.

    The file at ``filepath`` is read as a semicolon-separated CSV,
    adjusted, and written to ``output`` (semicolon-separated, comma as
    decimal mark, without the index). The adjustments are:

    * column ``ANO_FIM_2G`` is renamed to ``ANO_FIM_EM``;
    * columns ``TP_INSCRICAO`` and ``TP_INSCRICAO_ADM`` are added through
      ``reindex``;
    * column ``CO_TURNO_GRADUACAO`` is computed row by row from
      ``IN_MATUT``, ``IN_NOTURNO`` and ``IN_VESPER``;
    * columns ``AMOSTRA``, ``ID_STATUS``, ``IN_GRAD``, ``TP_SEMESTRE``,
      ``IN_MATUT``, ``IN_NOTURNO`` and ``IN_VESPER`` are dropped.

    Args:
        filepath (str): Path for the original data.
        output (str): Path for the output (converted) data.
    """
    df = pandas.read_csv(filepath, sep=';', dtype=_dtypes)
    df = df.rename(columns={'ANO_FIM_2G': 'ANO_FIM_EM'})
    # Reindexing with labels that are not present yet creates those
    # columns filled with missing values.
    df = df.reindex(
        df.columns.tolist() + ['TP_INSCRICAO', 'TP_INSCRICAO_ADM'], axis=1
    )
    # The three binary flags must still be present here; they are only
    # dropped after the combined code has been computed.
    df['CO_TURNO_GRADUACAO'] = df.apply(_label_co_turno_graduacao, axis=1)
    df = df.drop(
        columns=[
            'AMOSTRA',
            'ID_STATUS',
            'IN_GRAD',
            'TP_SEMESTRE',
            'IN_MATUT',
            'IN_NOTURNO',
            'IN_VESPER',
        ]
    )
    df.to_csv(output, sep=';', index=False, decimal=',')
def categorize(
    dataframe: PandasDataFrame,
    columns: List[str],
    only_current: bool = False
) -> PandasDataFrame:
    """Converts columns of a DataFrame to categorical type.

    Given a DataFrame, convert the given columns into categorical type
    according to predefined categories. The input DataFrame is left
    untouched; the conversion is applied to a copy.

    Args:
        dataframe (PandasDataFrame): A pandas DataFrame containing Enade
            microdata.
        columns (List[str]): A list of columns to be converted to
            categorical type.
        only_current (bool, optional): If true, uses only currently
            present values as categories, not the predefined ones.
            Defaults to False.

    Returns:
        PandasDataFrame: A new DataFrame with the converted columns.

    Raises:
        TypeError: If ``dataframe`` is not a pandas DataFrame or
            ``columns`` is not a list.
    """
    if not isinstance(dataframe, pandas.DataFrame):
        raise TypeError(
            'Argument "dataframe" should be of type pandas.DataFrame'
        )
    if not isinstance(columns, list):
        raise TypeError('Argument "columns" should be of type list')

    result = dataframe.copy()
    for col in columns:
        # Columns are replaced with ``result[col] = ...`` rather than
        # ``result.loc[:, col] = ...``: the latter writes the values into
        # the existing column in place on recent pandas versions and
        # therefore keeps the old dtype instead of making it categorical.
        try:
            idx_col = get_index_dict(col)
        except NameError:
            # NOTE(review): presumably get_index_dict raises NameError for
            # columns without a predefined index -- confirm in .index.
            # Fall back to categories inferred from the present values.
            result[col] = result[col].astype('category')
        else:
            cat_type = CategoricalDtype(list(idx_col.keys()))
            result[col] = result[col].astype(cat_type)
        if only_current:
            # remove_unused_categories returns a new Series, so its result
            # has to be assigned back to take effect.
            result[col] = result[col].cat.remove_unused_categories()
    return result