import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
def compute_vif(corr):
    """Return the variance inflation factor (VIF) of every feature.

    The VIF of feature i is 1 / (1 - R_i^2), where R_i^2 is the coefficient
    of determination obtained by regressing feature i on all the other
    features.  That quantity equals the i-th diagonal entry of the inverse
    of the feature correlation matrix, so all VIFs come from one inversion.
    (Using a single pairwise correlation r_ij in place of R_i^2 is only
    correct when there are exactly two features.)

    Args:
        corr: Square (n x n) correlation matrix of the features, as a NumPy
            array or anything ``numpy.asarray`` accepts.

    Returns:
        A 1-D NumPy array of length n; entry i is the VIF of feature i.

    Raises:
        ValueError: If ``corr`` is not a square 2-D matrix.
        numpy.linalg.LinAlgError: If ``corr`` is exactly singular, i.e. some
            feature is a perfect linear combination of the others.
    """
    corr = np.asarray(corr, dtype=float)
    if corr.ndim != 2 or corr.shape[0] != corr.shape[1]:
        raise ValueError(
            "corr must be a square 2-D matrix, got shape {}".format(corr.shape)
        )
    return np.diag(np.linalg.inv(corr))


if __name__ == "__main__":
    # Create a Spark session
    spark = SparkSession.builder.appName("VIFExample").getOrCreate()
    try:
        # NOTE: 'data' must already be a DataFrame containing the numeric
        # columns listed below; this example does not load it.
        feature_cols = ["feature1", "feature2", "feature3"]

        # Assemble the feature columns into a single vector column
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        data = assembler.transform(data)

        # Correlation.corr returns a one-row DataFrame whose single cell
        # holds the correlation matrix; convert it to a NumPy array.
        correlation_matrix = Correlation.corr(data, "features").head()
        corr_values = correlation_matrix[0].toArray()
        num_features = len(corr_values)

        # One VIF per feature (diagonal of the inverse correlation matrix)
        vif_values = compute_vif(corr_values)

        # Print the VIF values
        print("VIF Values:")
        for position, vif in enumerate(vif_values, start=1):
            print("Feature {}: {:.4f}".format(position, vif))
    finally:
        # Stop the Spark session even if one of the steps above failed
        spark.stop()
300x250
반응형
'Data Science > Statistic' 카테고리의 다른 글
AIC, BIC 로 모형 적합도 탐색하기(pyspark 코드) (1) | 2024.01.02 |
---|