Python Pandas Module: A Comprehensive Guide to Data Manipulation and Analysis

August 28, 2024

Modul Python ini menunjukkan fitur-fitur kunci dari pustaka Pandas dan mencakup contoh implementasi di industri. Izinkan saya menjelaskan bagian-bagian utamanya:

Pandas Data Structures:
- Membuat dan menampilkan contoh Pandas Series dan DataFrame
- Menunjukkan cara mengakses data dalam struktur-struktur ini.
Data Transformation and Manipulation:
- Menunjukkan penyaringan, pengurutan, pengelompokan, dan penerapan fungsi.
- Menunjukkan cara menangani data yang hilang dan melakukan operasi matematika
Data Cleaning and Preprocessing:
- Kami menangani pencilan dengan membatasi nilai-nilai.
- Kami menunjukkan cara menghapus duplikat dan mengubah tipe data.
Joining, Merging, and Reshaping:
- Kami menggabungkan dua DataFrame.
- Kami menunjukkan operasi pivot (pemutaran) dan melt (peleburan).
Industry Implementations:

- Kami membuat fungsi untuk perencanaan produksi, manajemen gudang, dan analisis fintech.

Pandas Demonstration Module:

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# 1. Pandas Data Structures
print("1. Pandas Data Structures")

# Series: a one-dimensional labelled array; np.nan marks a missing entry
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print("Series:")
print(s)

# DataFrame: every column below is built from a different kind of input
# (list, date range, Series, NumPy array, Categorical, broadcast scalar)
df = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': pd.date_range(start='2023-01-01', periods=4),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo'
})
print("\nDataFrame:")
print(df)

# Accessing data
print("\nAccessing data:")
print(df['A'])  # Accessing a column
print(df.loc[0])  # Accessing a row by label
print(df.iloc[1])  # Accessing a row by integer index

# 2. Data Transformation and Manipulation
print("\n2. Data Transformation and Manipulation")

# Filtering
print("Filtered DataFrame (A > 2):")
print(df[df['A'] > 2])

# Sorting
print("\nSorted DataFrame (by column A, descending):")
print(df.sort_values(by='A', ascending=False))

# Grouping
# numeric_only=True restricts the mean to numeric columns; without it the
# string column F makes the aggregation raise on pandas >= 2.0.
# observed=True keeps only categories that actually occur (both do here).
print("\nGrouped DataFrame (mean of numeric columns, grouped by E):")
print(df.groupby('E', observed=True).mean(numeric_only=True))

# Applying functions
print("\nApplying a function (multiply column A by 2):")
print(df['A'].apply(lambda x: x * 2))

# Handling missing data
# Column A starts out as integers, which cannot hold NaN. Cast explicitly
# rather than relying on an implicit upcast during assignment.
df['A'] = df['A'].astype('float64')
df.loc[1, 'A'] = np.nan
print("\nDataFrame with missing value:")
print(df)
print("\nFilling missing values with 0:")
# Fill only the column that has a gap, so that 0 is never validated against
# the categories of the categorical column E.
print(df.fillna({'A': 0}))

# Mathematical operations
# Restrict to numeric columns: dates, categories and strings have no mean.
print("\nMean of numeric columns:")
print(df.mean(numeric_only=True))

# 3. Data Cleaning and Preprocessing
print("\n3. Data Cleaning and Preprocessing")

# Handling outliers (example: capping values)
df['A'] = df['A'].clip(upper=3)
print("DataFrame with capped values (A <= 3):")
print(df)

# Removing duplicates
# Re-append the first two rows so there is something to de-duplicate.
df_dup = pd.concat([df, df.iloc[0:2]])
print("\nDataFrame with duplicates:")
print(df_dup)
print("\nDataFrame with duplicates removed:")
print(df_dup.drop_duplicates())

# Data type conversion
df['A'] = df['A'].astype('float64')
print("\nDataFrame with column A converted to float64:")
print(df.dtypes)

# 4. Joining, Merging, and Reshaping
print("\n4. Joining, Merging, and Reshaping")

# Creating another DataFrame for merging
df2 = pd.DataFrame({
    'A': [5, 6, 7, 8],
    'B': pd.date_range(start='2023-01-05', periods=4),
    'G': ['alpha', 'beta', 'gamma', 'delta']
})

# Merging DataFrames
# Outer join on A keeps the rows of both frames; the shared non-key column
# B appears twice, distinguished by the default _x / _y suffixes.
merged_df = pd.merge(df, df2, on='A', how='outer')
print("Merged DataFrame:")
print(merged_df)

# Pivoting
# One output column per category of E, holding the values of A.
df_pivot = df.pivot(columns='E', values='A')
print("\nPivoted DataFrame:")
print(df_pivot)

# Melting
# Wide to long: A and C become (variable, value) pairs keyed by B.
df_melted = pd.melt(df, id_vars=['B'], value_vars=['A', 'C'])
print("\nMelted DataFrame:")
print(df_melted)

# 5. Industry Implementations

def production_planning():
    """Print a small production plan with per-product time and cost.

    Builds a four-product table, derives the production time
    (demand / production rate) and the raw-material cost
    (demand * unit cost), then prints the table and both totals.
    """
    plan = pd.DataFrame({
        'Product': ['A', 'B', 'C', 'D'],
        'Demand': [100, 150, 80, 120],
        'Production_Rate': [10, 15, 8, 12],
        'Raw_Material_Cost': [50, 60, 40, 55]
    })

    # Derived columns are appended in this order: time first, then cost.
    plan = plan.assign(
        Production_Time=plan['Demand'].div(plan['Production_Rate']),
        Total_Cost=plan['Demand'].mul(plan['Raw_Material_Cost']),
    )
    time_total = plan['Production_Time'].sum()
    cost_total = plan['Total_Cost'].sum()

    print("Production Planning Optimization:")
    print(plan)
    print("\nTotal Production Time:", time_total)
    print("Total Raw Material Cost:", cost_total)

def warehouse_management():
    """Print an inventory table and the items at or below their reorder point.

    An item is flagged when its quantity is less than or equal to its
    reorder point; the flag is stored in the ``Need_Reorder`` column.
    """
    stock = pd.DataFrame({
        'Item': ['Widget', 'Gadget', 'Doohickey', 'Gizmo'],
        'Quantity': [500, 300, 200, 400],
        'Reorder_Point': [100, 50, 75, 150],
        'Lead_Time_Days': [5, 7, 3, 6]
    })

    stock['Need_Reorder'] = stock['Quantity'].le(stock['Reorder_Point'])
    flagged = stock.loc[stock['Need_Reorder']]

    print("Warehouse Inventory Status:")
    print(stock)
    print("\nItems Needing Reorder:")
    print(flagged)

def fintech_analysis(seed=None):
    """Print sum, mean and count of random transaction amounts per category.

    Generates 100 daily transactions starting 2023-01-01 with normally
    distributed amounts (scaled by 100 and shifted by 500) and a randomly
    chosen category, then prints the per-category aggregation.

    Args:
        seed: Optional seed for a private random generator. When ``None``
            (the default) NumPy's global random state is used, exactly as
            before, so the output differs between runs unless the caller
            seeded the global state. Passing an integer makes the printed
            report reproducible.
    """
    # Fall back to the module-level generator so existing callers (and any
    # global np.random.seed they set) keep behaving as they always did.
    rng = np.random if seed is None else np.random.RandomState(seed)

    transactions = pd.DataFrame({
        'Date': pd.date_range(start='2023-01-01', periods=100),
        'Amount': rng.randn(100) * 100 + 500,
        'Category': rng.choice(['Food', 'Transport', 'Entertainment', 'Bills'], 100)
    })

    print("FinTech Transaction Analysis:")
    print(transactions.groupby('Category').agg({'Amount': ['sum', 'mean', 'count']}))

# Run the three industry demos in order, printing a 50-character rule
# between consecutive demos (but not before the first or after the last).
for _position, _demo in enumerate((production_planning, warehouse_management, fintech_analysis)):
    if _position:
        print("\n" + "="*50 + "\n")
    _demo()

1. Pandas Data Structures

Pandas Series

A Pandas Series is a one-dimensional labeled array capable of holding any data type. It is similar to a Python list or a NumPy array but with additional capabilities.

python

import pandas as pd

# Build a Series of five integers labelled 'a' through 'e'
data = [10, 20, 30, 40, 50]
series = pd.Series(data, index=list('abcde'))

# Look up a single element by its label
print(series.loc['a'])  # Output: 10

# Arithmetic is applied to every element
print(series.mul(2))  # Element-wise multiplication

Pandas DataFrame

A Pandas DataFrame is a two-dimensional labeled data structure with columns of potentially different types. It can be thought of as a table or a spreadsheet in Python.

# Column-oriented input: each key becomes a column of the DataFrame
data = {
    'Product': ['A', 'B', 'C', 'D'],
    'Price': [100, 150, 200, 250],
    'Quantity': [10, 20, 15, 25]
}
df = pd.DataFrame.from_dict(data)

# Select one column, then one row by its index label
print(df.loc[:, 'Product'])  # Accessing a column
print(df.loc[0, :])          # Accessing a row

2. Data Transformation and Manipulation

Filtering and Sorting

Filtering allows you to select specific rows based on conditions, while sorting allows you to arrange data in ascending or descending order.

# Keep only the rows whose Price is strictly greater than 150
filtered_df = df.loc[df['Price'].gt(150)]

# Order the rows from the highest Price to the lowest
sorted_df = df.sort_values('Price', ascending=False)

Grouping and Applying Functions

Grouping data allows you to split the data into groups based on some criteria and then apply a function to each group.

# Sum the remaining columns within each Product group
grouped_df = df.groupby('Product').sum()

# Helper applied to one value at a time (per element, not per group)
def discount(price):
    """Return *price* reduced by 10%."""
    return price * 0.9

# Run the helper over every value of the Price column
df['Discounted_Price'] = df['Price'].map(discount)

Handling Missing Data

Handling missing data is crucial in data analysis. Pandas provides several methods to fill, interpolate, or drop missing values.

# Two columns with one gap each (None is stored as NaN)
df_with_nan = pd.DataFrame({
    'A': [1, 2, None, 4],
    'B': [None, 2, 3, 4]
})

# Replace every missing value with the constant 0
df_filled = df_with_nan.fillna(value=0)

# Discard any row that contains at least one missing value
df_dropped = df_with_nan.dropna(axis=0, how='any')

# Estimate missing values linearly from their neighbours
df_interpolated = df_with_nan.interpolate(method='linear')

Mathematical and Statistical Operations

Pandas allows you to perform a variety of mathematical and statistical operations on DataFrames.

# Average of the Price column
mean_price = df['Price'].agg('mean')

# Total of the Quantity column
total_quantity = df['Quantity'].agg('sum')

# New column: row-wise product of Price and Quantity
df['Total_Value'] = df['Price'].mul(df['Quantity'])

3. Data Cleaning and Preprocessing

Handling Outliers

Outliers can distort the results of data analysis. You can use various methods to detect and handle outliers.

# IQR rule: a price is an outlier when it lies more than 1.5 * IQR
# below the first quartile or above the third quartile
price = df['Price']
Q1 = price.quantile(0.25)
Q3 = price.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

outliers = df[(price < lower_fence) | (price > upper_fence)]

Removing Duplicates

Duplicate rows can be identified and removed to ensure the data's integrity.

# Keep the first occurrence of each distinct row and drop the repeats
df_unique = df.drop_duplicates(keep='first')

Data Type Conversion

Converting data types is often necessary when performing certain operations or preparing data for analysis.

# Store the Price column as 64-bit floating point
df['Price'] = df['Price'].astype('float64')

Data Normalization and Feature Engineering

Normalization scales the data into a specific range, while feature engineering creates new features from the existing data.

# Min-max scaling: (value - min) / (max - min)
price_min = df['Price'].min()
price_span = df['Price'].max() - price_min
df['Normalized_Price'] = (df['Price'] - price_min) / price_span

# Derived feature: Total_Value divided by Quantity, row by row
df['Value_Per_Unit'] = df['Total_Value'].div(df['Quantity'])

4. Joining, Merging, and Reshaping

Joining and Merging DataFrames

Joining or merging DataFrames is essential when working with related data from different sources.

# Two tables that share the key column ID (IDs 1 and 2 occur in both)
df1 = pd.DataFrame({'ID': [1, 2, 3], 'Name': ['A', 'B', 'C']})
df2 = pd.DataFrame({'ID': [1, 2, 4], 'Age': [25, 30, 35]})

# Inner join: keep only the IDs present in both tables
merged_df = df1.merge(df2, how='inner', on='ID')

Reshaping DataFrames

Reshaping allows you to change the structure of a DataFrame, such as pivoting, melting, or stacking/unstacking data.

# Pivot: Product labels the rows, each Quantity value becomes a column,
# and the cells hold Price
pivot_df = pd.pivot(df, index='Product', columns='Quantity', values='Price')

# Melt: turn the Price and Quantity columns into (variable, value) rows
melted_df = pd.melt(df, id_vars=['Product'], value_vars=['Price', 'Quantity'])

# Stack moves the column labels into the innermost index level;
# unstack moves that level back out into columns
stacked_df = df.stack()
unstacked_df = stacked_df.unstack(level=-1)

5. Industry-Specific Implementations

Production Planning and Optimization

Example: Using Pandas to optimize production schedules by analyzing production data, including quantities, costs, and time requirements.

import pandas as pd

# Per-machine hours and hourly output
production_data = {
    'Machine': ['M1', 'M2', 'M3'],
    'Hours_Required': [10, 5, 8],
    'Output_Per_Hour': [20, 25, 15]
}

# Total output per machine = hours required * output per hour
production_df = pd.DataFrame(production_data).assign(
    Total_Output=lambda frame: frame['Hours_Required'] * frame['Output_Per_Hour']
)

# Rank the machines by total output, highest first
sorted_machines = production_df.sort_values('Total_Output', ascending=False)
print(sorted_machines)

Warehouse and Logistics Management

Example: Managing inventory levels and tracking shipments using Pandas.

# Current stock and reorder threshold for each item
inventory_data = {
    'Item': ['A', 'B', 'C'],
    'Stock': [100, 200, 150],
    'Reorder_Level': [50, 100, 75]
}
inventory_df = pd.DataFrame(inventory_data)

# An item needs reordering when its stock is strictly below its threshold
needs_restock = inventory_df['Stock'].lt(inventory_df['Reorder_Level'])
to_reorder = inventory_df.loc[needs_restock]
print(to_reorder)

Financial Technology (FinTech) Solutions

Example: Analyzing customer transaction data to identify trends and calculate metrics like average transaction value.

# Five transactions spread over three customers
transaction_data = {
    'Customer_ID': [101, 102, 103, 101, 102],
    'Transaction_Amount': [100, 150, 200, 250, 300],
    'Transaction_Date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'])
}
transaction_df = pd.DataFrame(transaction_data)

# Mean transaction amount for each customer
amounts_by_customer = transaction_df.groupby('Customer_ID')['Transaction_Amount']
average_transaction = amounts_by_customer.agg('mean')
print(average_transaction)

Banking and Financial Services

Example: Managing customer account data and analyzing interest rates for different accounts.

# Balance and interest rate for each account
account_data = {
    'Account_Number': [1001, 1002, 1003],
    'Balance': [5000, 10000, 15000],
    'Interest_Rate': [0.02, 0.03, 0.025]
}
account_df = pd.DataFrame(account_data)

# Interest = balance * rate, computed row by row
account_df['Interest'] = account_df['Balance'].mul(account_df['Interest_Rate'])
print(account_df)

E-commerce Platforms

Example: Analyzing sales data to identify top-selling products and customer behavior patterns.

# Five sales records covering three products
sales_data = {
    'Product': ['P1', 'P2', 'P3', 'P1', 'P2'],
    'Quantity_Sold': [10, 20, 15, 25, 30],
    'Sales_Amount': [100, 200, 150, 250, 300]
}
sales_df = pd.DataFrame(sales_data)

# Add up the sales amount within each product
amount_by_product = sales_df.groupby('Product')['Sales_Amount']
total_sales = amount_by_product.agg('sum')
print(total_sales)

Insurance and Risk Management

Example: Evaluating insurance claims data to identify risk factors and calculate average claim amounts.

# Four claims of two different types
claims_data = {
    'Claim_ID': [1001, 1002, 1003, 1004],
    'Claim_Amount': [5000, 10000, 7000, 12000],
    'Claim_Type': ['Theft', 'Accident', 'Theft', 'Accident']
}
claims_df = pd.DataFrame(claims_data)

# Mean claim amount within each claim type
amount_by_type = claims_df.groupby('Claim_Type')['Claim_Amount']
average_claim = amount_by_type.agg('mean')
print(average_claim)

Maintenance and Asset Management

Example: Tracking maintenance schedules and costs for machinery in a manufacturing plant.

# Last maintenance date and cost for each machine
maintenance_data = {
    'Machine_ID': ['M1', 'M2', 'M3'],
    'Last_Maintenance': pd.to_datetime(['2023-01-01', '2023-01-15', '2023-02-01']),
    'Maintenance_Cost': [500, 700, 600]
}
maintenance_df = pd.DataFrame(maintenance_data)

# Sum the cost column over all machines
total_maintenance_cost = maintenance_df.loc[:, 'Maintenance_Cost'].sum()
print(total_maintenance_cost)

Project Management and Task Automation

Example: Managing project timelines and resources, calculating total project time and resource allocation.

# Duration and allocated resources for each task
project_data = {
    'Task': ['T1', 'T2', 'T3'],
    'Duration_Days': [5, 10, 7],
    'Resources_Allocated': [2, 3, 4]
}
project_df = pd.DataFrame(project_data)

# Sum of all task durations, in days
total_duration = project_df.loc[:, 'Duration_Days'].sum()
print(total_duration)

Quality Management and Process Improvement

Example: Monitoring quality metrics such as defect rates in production processes.

# Defect count and total units produced for each batch
quality_data = {
    'Batch_ID': ['B1', 'B2', 'B3'],
    'Defects': [5, 10, 3],
    'Total_Produced': [100, 200, 150]
}
quality_df = pd.DataFrame(quality_data)

# Defect rate = defects / units produced, as a fraction
quality_df['Defect_Rate'] = quality_df['Defects'].div(quality_df['Total_Produced'])
print(quality_df)

Administrative and Office Automation

Example: Automating administrative tasks such as generating reports from employee data.

# ID, name and salary of each employee
employee_data = {
    'Employee_ID': [101, 102, 103],
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Salary': [50000, 60000, 55000]
}
employee_df = pd.DataFrame(employee_data)

# Total payroll: the sum of all salaries
total_salary = employee_df.loc[:, 'Salary'].sum()
print(total_salary)

Travel and Hospitality Management

Example: Managing bookings and reservations data for a hotel.

# One record per booking: guest, room and length of stay
booking_data = {
    'Booking_ID': [1001, 1002, 1003],
    'Guest_Name': ['John', 'Jane', 'Doe'],
    'Room_Number': [101, 102, 103],
    'Nights_Stayed': [3, 2, 4]
}
booking_df = pd.DataFrame(booking_data)

# Revenue per booking at a flat $100 per night, then the overall total
booking_df['Revenue'] = booking_df['Nights_Stayed'].mul(100)
total_revenue = booking_df.loc[:, 'Revenue'].sum()
print(total_revenue)