I need to predict different types of exploitation using a RandomForestClassifier. My dataset contains several categorical variables such as gender, citizenship, and CountryOfExploitation. These variables are not ordinal, meaning there is no inherent order in their values. Here is a snippet of my dataset, and also a link to the dataset in case you need more information:
https://www.ctdatacollaborative.org/dataset/global-synthetic-data-and-resources/resource/microdata
| yearOfRegistration | gender | ageBroad | citizenship | CountryOfExploitation | traffickMonths | meansDebtBondageEarnings | meansThreats | meansAbusePsyPhySex | meansFalsePromises | meansDrugsAlcohol | meansDenyBasicNeeds | meansExcessiveWorkHours | meansWithholdDocs | isForcedLabour | isSexualExploit | isOtherExploit | typeOfLabourAgriculture | typeOfLabourConstruction | typeOfLabourDomesticWork | typeOfLabourHospitality | typeOfSexProstitution | typeOfSexPornography | recruiterRelationIntimatePartner | recruiterRelationFriend | recruiterRelationFamily | recruiterRelationOther |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2014 | Man | 30--38 | UKR | RUS | 0--12 (0-1 yr) | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | | | | | | | | | | | | |
| 2014 | Man | 30--38 | UKR | RUS | 0--12 (0-1 yr) | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | | | | | | | | | | | | |
| 2014 | Man | 30--38 | UKR | RUS | 0--12 (0-1 yr) | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | | | | | | | | | | | | |
| 2014 | Man | 30--38 | UKR | RUS | 0--12 (0-1 yr) | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | | | | | | | | | | | | |
| 2014 | Man | 30--38 | UKR | RUS | 0--12 (0-1 yr) | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | | | | | | | | | | | | |
| 2014 | Man | 30--38 | UKR | RUS | 0--12 (0-1 yr) | 1 |
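For context, this is roughly how I check how many distinct values each of these columns takes (a small sketch; it assumes the same CSV file used in the code below):

```python
import pandas as pd

# Quick cardinality check for the nominal columns (same CSV as in the code below)
df = pd.read_csv('CTDC_global_synthetic_data_v2024.csv', low_memory=False)
for column in ['gender', 'ageBroad', 'citizenship', 'CountryOfExploitation']:
    print(column, '->', df[column].nunique(dropna=True), 'distinct values')
```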
Initially, I used LabelEncoder to encode these variables. However, I realized that LabelEncoder is not suitable because it treats the categories as ordinal, which they are not. I then tried one-hot encoding, but it did not work well with my model (more on that below). Here is my original code, which uses LabelEncoder:
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import partial_dependence
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import category_encoders as ce
# Load the data
df = pd.read_csv('CTDC_global_synthetic_data_v2024.csv', low_memory=False)
# Drop 'traffickMonths' and rows with missing values in critical columns
df = df.drop(columns=['traffickMonths'])
critical_columns = ['yearOfRegistration', 'gender', 'ageBroad', 'citizenship', 'CountryOfExploitation']
df = df.dropna(subset=critical_columns)
# Drop rows where all three indicators are missing
df = df.dropna(subset=['isSexualExploit', 'isForcedLabour', 'isOtherExploit'], how='all')
# Fill NaN in 'isSexualExploit', 'isForcedLabour', and 'isOtherExploit' with 0 to simplify checks
df[['isSexualExploit', 'isForcedLabour', 'isOtherExploit']] = df[['isSexualExploit', 'isForcedLabour', 'isOtherExploit']].fillna(0)
# Convert to integers (0 or 1)
df[['isSexualExploit', 'isForcedLabour', 'isOtherExploit']] = df[['isSexualExploit', 'isForcedLabour', 'isOtherExploit']].astype(int)
# Define target based on conditions
conditions = [
    (df['isForcedLabour'] == 1) & (df['isSexualExploit'] == 0) & (df['isOtherExploit'] == 0),  # Only Forced Labour
    (df['isForcedLabour'] == 0) & (df['isSexualExploit'] == 1) & (df['isOtherExploit'] == 0),  # Only Sexual Exploitation
    (df['isForcedLabour'] == 0) & (df['isSexualExploit'] == 0) & (df['isOtherExploit'] == 1),  # Only Other Exploit
    (df['isForcedLabour'] == 1) & (df['isSexualExploit'] == 1) & (df['isOtherExploit'] == 0)   # Both Sexual Exploit and Forced Labour
]
choices = [1, 2, 3, 4]
# Create the target column
df['exploitType'] = np.select(conditions, choices, default=0)
# Drop rows that did not match any of the four conditions (exploitType == 0)
df = df[df['exploitType'] != 0]
# Encode the categorical variables
label_encoders = {}
for column in critical_columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le
# Split the data into training and testing sets
X = df[critical_columns]
y = df['exploitType']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the RandomForest model
model = RandomForestClassifier()
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Calculate accuracy, confusion matrix, and classification report
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# Print the results
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
# Function to generate bar plot for partial dependence
def plot_partial_dependence_for_exploit_type(df, exploit_type, title, critical_columns, label_encoders):
    X = df[critical_columns]
    y = (df['exploitType'] == exploit_type).astype(int)  # Binary target for the specific exploit type
    model = RandomForestClassifier()
    model.fit(X, y)
    # Loop through each categorical column
    for column in critical_columns:
        if column in label_encoders:
            # Calculate partial dependence
            pdp_results = partial_dependence(model, X, features=[column])
            # Map encoded values back to original categories
            encoded_values = pdp_results['values'][0]
            categories = label_encoders[column].inverse_transform(encoded_values.astype(int))
            partial_dependence_values = pdp_results['average'][0]
            # Create a bar plot
            plt.figure(figsize=(10, 6))
            plt.bar(categories, partial_dependence_values)
            plt.xlabel('Category')
            plt.ylabel('Partial Dependence')
            plt.title(f'Partial Dependence of {title} on {column}')
            plt.show()
# Generate partial dependence plots for each exploit type and each categorical variable
exploit_titles = ['Only Forced Labour', 'Only Sexual Exploitation', 'Only Other Exploit', 'Both Sexual Exploit and Forced Labour']
exploit_types = [1, 2, 3, 4]
for exploit_type, title in zip(exploit_types, exploit_titles):
    plot_partial_dependence_for_exploit_type(df, exploit_type, title, critical_columns, label_encoders)
```
OUTPUT:
```
Accuracy: 0.9564681724845996
Confusion Matrix:
 [[ 827   52    0    1]
 [  11 1488    0    5]
 [   2    1    5    0]
 [   0   34    0    9]]
Classification Report:
               precision    recall  f1-score   support

           1       0.98      0.94      0.96       880
           2       0.94      0.99      0.97      1504
           3       1.00      0.62      0.77         8
           4       0.60      0.21      0.31        43

    accuracy                           0.96      2435
   macro avg       0.88      0.69      0.75      2435
weighted avg       0.95      0.96      0.95
```
I also considered sticking with one-hot encoding instead, but there are roughly 50 countries in this dataset and I could not get the partial dependence plots to generate from the dummy columns. I have since deleted that code, so I can't show exactly what I did.
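Roughly, the idea was something like the sketch below (this is not my original code; it assumes `df` is the preprocessed DataFrame from the first script, with `exploitType` already created, and it computes a per-category dependence by hand by switching the country dummies on one at a time):

```python
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Sketch only, not the deleted code. Assumes `df` is the filtered DataFrame from
# the first script, i.e. it already has the 'exploitType' column.
cat_cols = ['gender', 'ageBroad', 'citizenship', 'CountryOfExploitation']
df_oh = pd.get_dummies(df, columns=cat_cols,
                       prefix=['gender', 'age', 'cit', 'country'], dtype=int)

feature_cols = ['yearOfRegistration'] + [c for c in df_oh.columns
                                         if c.startswith(('gender_', 'age_', 'cit_', 'country_'))]
X = df_oh[feature_cols]
y = df_oh['exploitType']
model = RandomForestClassifier(random_state=42).fit(X, y)

# Hand-rolled partial dependence over the country categories: force every row
# into one country at a time and average the predicted probability of one class
# (here class 1 = Only Forced Labour).
country_cols = [c for c in feature_cols if c.startswith('country_')]
class_index = list(model.classes_).index(1)
pdp = {}
for col in country_cols:
    X_mod = X.copy()
    X_mod[country_cols] = 0
    X_mod[col] = 1
    pdp[col.replace('country_', '')] = model.predict_proba(X_mod)[:, class_index].mean()

plt.figure(figsize=(12, 6))
plt.bar(list(pdp.keys()), list(pdp.values()))
plt.xticks(rotation=90)
plt.ylabel('Mean predicted P(Only Forced Labour)')
plt.title('Hand-rolled partial dependence on CountryOfExploitation (one-hot)')
plt.tight_layout()
plt.show()
```

The part that tripped me up is that with dummies there is no single column for `partial_dependence` to sweep over, so each country ends up as its own 0/1 feature unless you group them by hand as above.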
I also tried target encoding, but for some reason it is not producing the partial dependence plots for CountryOfExploitation (a small diagnostic check follows the code below):
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import partial_dependence
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import category_encoders as ce
# Load the data
df = pd.read_csv('CTDC_global_synthetic_data_v2024.csv', low_memory=False)
# Drop 'traffickMonths' and rows with missing values in critical columns
df = df.drop(columns=['traffickMonths'])
critical_columns = ['yearOfRegistration', 'gender', 'ageBroad', 'citizenship', 'CountryOfExploitation']
df = df.dropna(subset=critical_columns)
# Drop rows where all three indicators are missing
df = df.dropna(subset=['isSexualExploit', 'isForcedLabour', 'isOtherExploit'], how='all')
# Fill NaN in 'isSexualExploit', 'isForcedLabour', and 'isOtherExploit' with 0 to simplify checks
df[['isSexualExploit', 'isForcedLabour', 'isOtherExploit']] = df[['isSexualExploit', 'isForcedLabour', 'isOtherExploit']].fillna(0)
# Convert to integers (0 or 1)
df[['isSexualExploit', 'isForcedLabour', 'isOtherExploit']] = df[['isSexualExploit', 'isForcedLabour', 'isOtherExploit']].astype(int)
# Drop rows where gender is Trans/Transgender/NonConforming
df = df[df['gender'] != 'Trans/Transgender/NonConforming']
# Define target based on conditions
conditions = [
    (df['isForcedLabour'] == 1) & (df['isSexualExploit'] == 0) & (df['isOtherExploit'] == 0),  # Only Forced Labour
    (df['isForcedLabour'] == 0) & (df['isSexualExploit'] == 1) & (df['isOtherExploit'] == 0),  # Only Sexual Exploitation
    (df['isForcedLabour'] == 0) & (df['isSexualExploit'] == 0) & (df['isOtherExploit'] == 1),  # Only Other Exploit
    (df['isForcedLabour'] == 1) & (df['isSexualExploit'] == 1) & (df['isOtherExploit'] == 0)   # Both Sexual Exploit and Forced Labour
]
choices = [1, 2, 3, 4]
# Create the target column
df['exploitType'] = np.select(conditions, choices, default=0)
# Drop rows that did not match any of the four conditions (exploitType == 0)
df = df[df['exploitType'] != 0]
# Print data distribution
print("Gender Distribution:\n", df['gender'].value_counts())
print("Exploit Type Distribution:\n", df['exploitType'].value_counts())
# Calculate value counts of gender within each exploitType category
gender_exploit_counts = df.groupby(['exploitType', 'gender']).size().unstack(fill_value=0)
print("Gender counts within each Exploit Type category:\n", gender_exploit_counts)
# Encode ordinal variables using LabelEncoder
label_encoders = {}
ordinal_columns = ['yearOfRegistration', 'ageBroad']
for column in ordinal_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le
# Split the data into training and testing sets
X = df[critical_columns]
y = df['exploitType']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Target encode non-ordinal categorical variables using category_encoders
non_ordinal_columns = ['gender', 'citizenship', 'CountryOfExploitation']
target_encoder = ce.TargetEncoder(cols=non_ordinal_columns)
X_train_te = X_train.copy()
X_test_te = X_test.copy()
# Fit the target encoder on the training data and transform both training and test data
X_train_te = target_encoder.fit_transform(X_train_te, y_train)
X_test_te = target_encoder.transform(X_test_te)
# Train the RandomForest model
model = RandomForestClassifier()
model.fit(X_train_te, y_train)
# Make predictions
y_pred_te = model.predict(X_test_te)
# Calculate accuracy, confusion matrix, and classification report
accuracy_te = accuracy_score(y_test, y_pred_te)
conf_matrix_te = confusion_matrix(y_test, y_pred_te)
class_report_te = classification_report(y_test, y_pred_te)
# Print the results
print(f"Target Encoder Accuracy: {accuracy_te}")
print("Confusion Matrix:\n", conf_matrix_te)
print("Classification Report:\n", class_report_te)
# Display the head of the transformed training data
print(X_train_te.head())
# Function to generate bar plot for partial dependence
def plot_partial_dependence_for_exploit_type(X, y, title, columns, encoder, df):
    model = RandomForestClassifier()
    model.fit(X, y)
    for column in columns:
        pdp_results = partial_dependence(model, X, features=[column])
        if column in non_ordinal_columns:
            categories = df[column].unique()
        else:
            categories = label_encoders[column].inverse_transform(pdp_results['values'][0].astype(int))
        if len(pdp_results['average'][0]) == len(categories):
            plt.figure(figsize=(10, 6))
            plt.bar(categories, pdp_results['average'][0])
            plt.xlabel('Category')
            plt.ylabel('Partial Dependence')
            plt.title(f'Partial Dependence of {title} on {column}')
            plt.show()
# Generate partial dependence plots for each exploit type and each categorical variable
all_columns = ['gender', 'citizenship', 'CountryOfExploitation', 'yearOfRegistration', 'ageBroad']
for exploit_type, title in zip([1, 2, 3, 4], ['Only Forced Labour', 'Only Sexual Exploitation', 'Only Other Exploit', 'Both Sexual Exploit and Forced Labour']):
    y_binary = (y_train == exploit_type).astype(int)
    plot_partial_dependence_for_exploit_type(X_train_te, y_binary, title, all_columns, target_encoder, df)
```
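To see where it stops, this is roughly the check I ran afterwards (a sketch; it assumes `model`, `X_train_te`, `df` and `non_ordinal_columns` from the script above are still in scope):

```python
from sklearn.inspection import partial_dependence

# Diagnostic sketch: compare the size of the partial-dependence grid with the
# number of raw categories. The bar plot above is only drawn when they match.
for column in ['gender', 'citizenship', 'CountryOfExploitation']:
    pdp_results = partial_dependence(model, X_train_te, features=[column])
    grid = pdp_results['values'][0]  # newer scikit-learn releases may expose this as 'grid_values'
    categories = df[column].unique()
    print(column, '-> grid points:', len(grid), '| categories in df:', len(categories))
```

Presumably the two lengths do not match for CountryOfExploitation (otherwise the `if len(...) == len(categories)` check would have let the plot through), but I don't understand why, or what the right fix is.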
Finally, I came across this note in the scikit-learn documentation for `sklearn.preprocessing.LabelEncoder`, which is part of why I am second-guessing my first approach:

> Encode target labels with value between 0 and n_classes-1. This transformer should be used to encode target values, i.e. y, and not the input X.
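If I understand that note correctly, the feature-side counterpart would be OrdinalEncoder (or OneHotEncoder) inside a ColumnTransformer rather than a per-column LabelEncoder loop. A minimal sketch of what I mean, using the same column names as above (this is an assumption on my part, not something I have fully tested):

```python
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Sketch: encode the nominal feature columns inside a pipeline instead of
# looping over them with LabelEncoder. Column names as in the scripts above.
categorical = ['gender', 'ageBroad', 'citizenship', 'CountryOfExploitation']

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical),
    ],
    remainder='passthrough',  # keeps yearOfRegistration unchanged
)

pipeline = Pipeline([
    ('prep', preprocess),
    ('rf', RandomForestClassifier(random_state=42)),
])

# pipeline.fit(X_train, y_train) would replace the manual encoding steps above,
# but the integers OrdinalEncoder produces are still arbitrary, not truly ordinal.
```

Is something like this (or the target encoding above) the right way to handle these nominal variables for a RandomForestClassifier, and how can I still get per-category partial dependence plots for citizenship and CountryOfExploitation?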