require 'polars'

# Load up our iris friends: read iris.csv (a relative path) into a DataFrame
# and print the summary produced by DataFrame#describe.
iris_df = Polars.read_csv("iris.csv")
puts iris_df.describe
require 'matrix' # stdlib Matrix/Vector supply the transpose, product and inverse used below

# Fit petal_width as a linear function of petal_length, sepal_length and
# sepal_width by ordinary least squares, solving the normal equation
# β = (X'X)⁻¹X'y directly.

# Extract our variables
y = iris_df['petal_width']
X = iris_df.select(['petal_length', 'sepal_length', 'sepal_width'])

# Add a column of 1s for the intercept term. It is appended after the three
# predictors, so the intercept is the LAST coefficient of the solution.
X_with_intercept = X.with_columns(Polars.lit(1).alias('intercept'))

# Move the data into stdlib Matrix/Vector objects. Float() raises on a nil
# (null) cell instead of silently treating it as 0.0.
design_matrix = Matrix.rows(X_with_intercept.rows).map { |value| Float(value) }
response = Vector.elements(y.to_a.map { |value| Float(value) })

# Calculate X'X (X transpose multiplied by X)
X_transpose_X = design_matrix.transpose * design_matrix

# Calculate the inverse of X'X (Matrix#inverse raises if X'X is singular,
# e.g. when two predictors are perfectly collinear)
X_transpose_X_inverse = X_transpose_X.inverse

# Calculate X'y
X_transpose_y = design_matrix.transpose * response

# Calculate the coefficients β
coefficients = X_transpose_X_inverse * X_transpose_y

# Pull out the individual coefficients, in design-matrix column order
petal_length_coef, sepal_length_coef, sepal_width_coef, intercept = coefficients.to_a

puts "Our equation is:"
puts "petal_width = #{intercept.round(4)} + " \
     "#{petal_length_coef.round(4)} * petal_length + " \
     "#{sepal_length_coef.round(4)} * sepal_length + " \
     "#{sepal_width_coef.round(4)} * sepal_width"
# Let's predict a new iris!
new_iris = {
  'petal_length' => 4.5,
  'sepal_length' => 6.0,
  'sepal_width' => 3.0
}

# Plug the measurements into the fitted equation:
# intercept + sum of (coefficient * measurement).
predicted_width = intercept +
                  petal_length_coef * new_iris['petal_length'] +
                  sepal_length_coef * new_iris['sepal_length'] +
                  sepal_width_coef * new_iris['sepal_width']

# The measurements are interpolated from new_iris so the message cannot drift
# from the values that were actually used in the prediction.
puts "For an iris with petal length of #{new_iris['petal_length']}cm, " \
     "sepal length of #{new_iris['sepal_length']}cm, " \
     "and sepal width of #{new_iris['sepal_width']}cm:"
puts "Predicted petal width: #{predicted_width.round(2)} cm"
# Compare the mean squared error of the three-predictor model against a
# petal-length-only model, both evaluated on the data in iris_df.
petal_length = iris_df['petal_length']
petal_width = iris_df['petal_width']

# Make predictions using our multivariate model. Column-wise Series arithmetic
# evaluates the fitted equation for every row at once; each Series is kept on
# the left-hand side of its operator so Polars performs the operation.
multivariate_predictions =
  petal_length * petal_length_coef +
  iris_df['sepal_length'] * sepal_length_coef +
  iris_df['sepal_width'] * sepal_width_coef +
  intercept

# Calculate the errors
multivariate_errors = multivariate_predictions - petal_width
multivariate_squared_errors = multivariate_errors ** 2
multivariate_mse = multivariate_squared_errors.mean

# Make predictions using just petal length (like in our previous article).
# NOTE(review): simple_slope / simple_intercept are not assigned anywhere in
# the code visible here. `||=` reuses them when an earlier part of the script
# already set them; otherwise they are fitted by least squares:
#   slope = Σ(x - x̄)(y - ȳ) / Σ(x - x̄)²,  intercept = ȳ - slope * x̄
length_deviation = petal_length - petal_length.mean
width_deviation = petal_width - petal_width.mean
simple_slope ||= (length_deviation * width_deviation).sum / (length_deviation ** 2).sum
simple_intercept ||= petal_width.mean - simple_slope * petal_length.mean

simple_predictions = petal_length * simple_slope + simple_intercept
simple_errors = simple_predictions - petal_width
simple_squared_errors = simple_errors ** 2
simple_mse = simple_squared_errors.mean

puts "Mean Squared Error (Multivariate): #{multivariate_mse.round(4)}"
puts "Mean Squared Error (Simple): #{simple_mse.round(4)}"
# Print each fitted slope, rounded to four decimal places.
puts "Coefficients:"
puts "Petal Length: #{petal_length_coef.round(4)}"
puts "Sepal Length: #{sepal_length_coef.round(4)}"
puts "Sepal Width: #{sepal_width_coef.round(4)}"
require 'matrix' # stdlib Matrix/Vector, used to redo the normal-equation fit

# Standardize our variables (subtract mean, divide by std). One expression is
# built per column of X; each result keeps its source column's name.
standardized_X = X.select(
  X.columns.map { |name| (Polars.col(name) - Polars.col(name).mean) / Polars.col(name).std }
)

# Redo the regression with standardized variables: append a column of 1s for
# the intercept and solve β = (X'X)⁻¹X'y again.
# NOTE(review): only the predictors are standardized here, so y stays in its
# original units — confirm that is the intended definition of "standardized".
standardized_rows = standardized_X.rows.map { |row| row.map { |value| Float(value) } + [1.0] }
standardized_design = Matrix.rows(standardized_rows)
standardized_response = Vector.elements(y.to_a.map { |value| Float(value) })
standardized_coefficients =
  (standardized_design.transpose * standardized_design).inverse *
  (standardized_design.transpose * standardized_response)

# Look the slopes up by column name rather than by position, so the report
# does not depend on the order in which X's columns were selected. The extra
# trailing coefficient (the intercept) is dropped by zip.
coefficient_by_column = standardized_X.columns.zip(standardized_coefficients.to_a).to_h
std_petal_length_coef = coefficient_by_column.fetch('petal_length')
std_sepal_length_coef = coefficient_by_column.fetch('sepal_length')
std_sepal_width_coef = coefficient_by_column.fetch('sepal_width')

puts "Standardized coefficients (these we can compare!):"
puts "Petal Length: #{std_petal_length_coef.round(4)}"
puts "Sepal Length: #{std_sepal_length_coef.round(4)}"
puts "Sepal Width: #{std_sepal_width_coef.round(4)}"