def get_dataset(dataset_name: str, path="../data/"):
    """
    Download a dataset into a local directory.

    Files are fetched from Google Drive, except for entries whose id is the
    marker ``"S3"`` in the lookup table below; those are fetched from the
    ``dslectures`` S3 bucket. A file that already exists at the target
    location is not downloaded again.

    Args:
        dataset_name (str): File name of the dataset; must be a key of
            ``name_to_id``.
        path (str): Target directory, created if it does not exist. A
            trailing slash is optional.

    Raises:
        KeyError: If ``dataset_name`` is not a known dataset.
        Exception: Any error raised by the download is re-raised after a
            hint has been printed.
    """
    # NOTE(review): "kaggle_titanic_train.csv" and "kaggle_housing_train.csv"
    # share the same Google Drive id -- one of them is presumably a
    # copy-paste mistake; confirm the correct id.
    name_to_id = {
        "word2vec-google-news-300.pkl": "1dRwSXbFTcQbn8c3V24G92wFY4DXZ1SDt",
        "imdb.csv": "1wF0YEmQOwceJz2d6w4CfhBgydU87dPGl",
        "housing.csv": "1d7oOKdDmZFx8wf0c8OfuTW1FpUyJHABh",
        "housing_gmaps_data_raw.csv": "1R1RUHAXxzrIngRJMFwyp4vZRVICd-I6T",
        "housing_addresses.csv": "1mOK0uyRz5Zs-Qo7mVMlxwtb2xn1E6N9Q",
        "housing_merged.csv": "1bdYuBtIPrKiU-ut2MeSSsL47onPtZrRt",
        "housing_processed.csv": "12PxnWhPg_Pj0yx75vD22gwfdkkx80E6_",
        "churn.csv": "1-IO-JQr7tjQGIKZyo_SyupCpX2VNDQIf",
        "kaggle_housing_train.csv": "1BHiuZyMab7rPA8Rog29fIYhJmjvJLkVI",
        "kaggle_housing_test.csv": "1KSfBhIdFlejUWAnrfFl10c-rjA4VhgkT",
        "kaggle_titanic_train.csv": "1BHiuZyMab7rPA8Rog29fIYhJmjvJLkVI",
        "kaggle_titanic_test.csv": "1NFCDTBF4dM8rllv0fP3VnPmoRLmfdOEB",
        "fine_tuned.pth": "S3",
        "data_lm.pkl": "S3",
    }
    gdrive_path = "https://docs.google.com/uc?export=download&id="
    s3_path = "https://dslectures.s3.eu-central-1.amazonaws.com/"

    os.makedirs(path, exist_ok=True)

    if dataset_name not in name_to_id:
        raise KeyError("File not on Google Drive.")

    # os.path.join (instead of string concatenation) keeps the existence
    # check correct when `path` is given without a trailing separator.
    target = os.path.join(path, dataset_name)
    if os.path.exists(target):
        print(f"Dataset already exists at '{target}' and is not downloaded again.")
        return

    file_id = name_to_id[dataset_name]
    if file_id == "S3":
        # S3 objects are addressed by file name rather than by an id.
        file_url = s3_path + dataset_name
    else:
        file_url = gdrive_path + file_id

    try:
        wget.download(file_url, out=path)
    except Exception:
        print("Something went wrong during download. Try again.")
        # Bare raise preserves the original traceback.
        raise
    print(f"Download of {dataset_name} dataset complete.")
California Housing Prices
This dataset from Kaggle (link) is used in the second chapter of Aurélien Géron's recent book Hands-On Machine Learning with Scikit-Learn and TensorFlow.
# Download the three housing source files into the default data directory
# ("../data/"); files that already exist there are skipped by get_dataset.
get_dataset("housing.csv")
get_dataset("housing_gmaps_data_raw.csv")
get_dataset("housing_addresses.csv")
The merged version of housing.csv and housing_addresses.csv, created in lesson 2.
# Download the merged housing table into the default data directory.
get_dataset("housing_merged.csv")
The processed version of housing_merged.csv with no missing values and categorical columns encoded numerically.
# Download the processed housing table into the default data directory.
get_dataset("housing_processed.csv")
imdb.csv 
The IMDB dataset is available on Kaggle (link). This is a dataset for binary sentiment classification and provides a set of 25,000 highly polar movie reviews for training and 25,000 for testing.
# Download the IMDB reviews file into the default data directory.
get_dataset("imdb.csv")
word2vec-google-news-300.pkl 
Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in Distributed Representations of Words and Phrases and their Compositionality. This dataset is available from GENSIM (link).
# Download the pickled word2vec vectors into the default data directory.
get_dataset("word2vec-google-news-300.pkl")
IBM's telecommunications dataset on customer churn. The dataset includes information about:
- Customers who left within the last month – the column is called Churn
- Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
- Customer account information – how long they’ve been a customer (tenure), contract, payment method, paperless billing, monthly charges, and total charges
- Demographic info about customers – gender, whether they're a senior citizen or not, and if they have partners and dependents
# Download the customer churn file into the default data directory.
get_dataset("churn.csv")
Train and test data for Kaggle's regression challenge (link here).
# Download the train and test splits of the Kaggle housing data.
get_dataset("kaggle_housing_train.csv")
get_dataset("kaggle_housing_test.csv")
Train and test data for Kaggle's Titanic challenge (link here).
# Download the train and test splits of the Kaggle Titanic data.
# NOTE(review): the id registered for "kaggle_titanic_train.csv" in
# get_dataset equals the one for "kaggle_housing_train.csv" -- verify that
# the downloaded file really is the Titanic training set.
get_dataset("kaggle_titanic_train.csv")
get_dataset("kaggle_titanic_test.csv")
# "fine_tuned.pth" is registered with the "S3" marker in get_dataset, so it
# is fetched from the S3 bucket rather than from Google Drive.
get_dataset("fine_tuned.pth")
def rmse(y, yhat):
    """Compute the Root Mean Square Error (RMSE) between targets and predictions.

    Args:
        y (array): Actual values for target.
        yhat (array): Predicted values for target.

    Returns:
        rmse (double): Square root of the mean squared error of `yhat` w.r.t. `y`.
    """
    mse = mean_squared_error(y, yhat)
    return np.sqrt(mse)
# Worked example: the errors are (2, 0, -3), so the RMSE is
# sqrt((4 + 0 + 9) / 3) ~= 2.08.
y = np.array([2, 2, 3])
yhat = np.array([0, 2, 6])
# Bare expression: its value is only shown in an interactive/notebook session.
rmse(y, yhat)
def convert_strings_to_categories(df):
    """Convert every object-dtype column of `df` to the Categorical dtype.

    The frame is modified in place; nothing is returned.

    Args:
        df (pandas.DataFrame): Frame whose object-dtype columns are converted.
    """
    # Collect the affected columns first, then convert them one by one.
    object_columns = [name for name in df.columns if is_object_dtype(df[name])]
    for name in object_columns:
        df[name] = df[name].astype("category")
# Example frame with one string column and one numerical column.
df = pd.DataFrame(
    {"string_column": ["apple", "banana", "orange"], "numerical_column": [0, 2, 4]}
)
# The bare expressions below only display a value in an interactive/notebook
# session; they show the frame and its dtypes before the conversion ...
df.head()
df.dtypes
# ... convert the object-dtype columns in place ...
convert_strings_to_categories(df)
# ... and inspect the dtypes, the category labels and their integer codes.
df.dtypes
df["string_column"].cat.categories
df["string_column"].cat.codes
def fill_missing_values_with_median(df):
    """Replace missing values in numerical columns with the column median.

    The frame is modified in place; nothing is returned. Non-numerical
    columns and numerical columns without missing values are left untouched.

    Args:
        df (pandas.DataFrame): Frame whose numerical columns are imputed.
    """
    for column in df.columns:
        if not is_numeric_dtype(df[column]):
            continue
        if df[column].isnull().any():
            column_median = df[column].median()
            # Assign the result back instead of calling
            # `df[column].fillna(..., inplace=True)`: the chained in-place
            # form operates on a temporary object and does not update `df`
            # when pandas Copy-on-Write is active.
            df[column] = df[column].fillna(column_median)
# Example frame: columns A and B contain NaNs, column C is complete.
df = pd.DataFrame(
    [[np.nan, 2, 0], [3, 4, 1], [np.nan, np.nan, 5], [np.nan, 3, 4],],
    columns=list("ABC"),
)
# Bare expression: shows the frame before imputation (interactive sessions only).
df
# Impute the missing values in place, then show the frame again.
fill_missing_values_with_median(df)
df
def make_polynomial_data(weight, n_samples=100, seed=42):
    """
    Generate sorted random x-values and noisy polynomial y-values.

    Note that the global NumPy random state is re-seeded with `seed`.

    Args:
        weight (array): polynomial weights in descending order
        n_samples (int): number of samples
        seed (int): random seed

    returns:
        x (array): x-values, sorted in ascending order
        y (array): polynomial evaluated at x plus standard normal noise
    """
    np.random.seed(seed)

    # draw the x positions first (uniform samples rescaled by (0.5 - u) * 2),
    # then sort them
    x = np.sort((0.5 - np.random.rand(n_samples)) * 2)

    # noise is drawn after the x positions, from the standard normal distribution
    noise = np.random.randn(n_samples)

    # evaluate the polynomial at the sorted positions and perturb it
    y = np.polyval(weight, x) + noise

    return x, y
# Noisy samples of y = 10 * x**2 (weights are given in descending order).
# `x` and `y` are reused by the PolynomialRegressor example further down.
x, y = make_polynomial_data(np.array([10, 0, 0]))
plt.plot(x, y)
class PolynomialRegressor:
    """
    Polynomial fitter exposing a scikit-learn-like interface
    (fit / predict / evaluate / get_params).
    """

    def __init__(self, degree=0, w_hat=None):
        """
        Set up the polynomial fitter.

        args:
            degree (int): degree of polynomial to fit, default: 0.
            w_hat (array): polynomial weights, default: None.

        """
        self.w_hat = w_hat
        self.degree = degree

    def _fitted_weights(self):
        """Return the stored weights, or raise if no weights are available."""
        if self.w_hat is None:
            raise ValueError('You need to first fit the model.')
        return self.w_hat

    def fit(self, x, y):
        """Fit a polynomial of the configured degree to x,y data and return self."""
        self.w_hat = np.polyfit(x, y, self.degree)
        return self

    def predict(self, x):
        """Evaluate the fitted polynomial at x."""
        return np.polyval(self._fitted_weights(), x)

    def evaluate(self, x, y):
        """Return the RMSE between y and the polynomial's predictions at x."""
        predictions = np.polyval(self._fitted_weights(), x)
        return rmse(y, predictions)

    def get_params(self, **kwargs):
        """Return the weights and the degree as a dict; keyword arguments are ignored."""
        return dict(w_hat=self.w_hat, degree=self.degree)
            
# Fit a degree-2 polynomial to the module-level x, y samples created earlier.
pf = PolynomialRegressor(degree=2)
pf.fit(x, y)
# Evaluate the fit on an evenly spaced grid over [-1, 1].
x_lin = np.linspace(-1, 1, 100)
y_hat =pf.predict(x_lin)
# Compare the data, the fitted curve and the noise-free polynomial 10 * x**2.
plt.scatter(x, y, label='data')
plt.plot(x_lin, y_hat, c='r', label='fit')
plt.plot(x_lin, np.polyval([10, 0, 0], x_lin), linestyle='--', c='black', label='ground truth')
plt.legend(loc='best')
plt.show()
# RMSE is computed on the same samples the model was fitted to.
print(f"RMSE: {pf.evaluate(x, y):.3f}")
def display_large(df):
    """Display a pandas.DataFrame or pandas.Series with up to 1000 rows and columns shown.

    The display limits are raised only for the duration of the call.
    """
    limit = 1000
    with pd.option_context("display.max_rows", limit, "display.max_columns", limit):
        display(df)
# Single-row frame with 100 columns named col_0 ... col_99 (value k in col_k).
data = [{"col_" + str(k): v for k, v in enumerate(range(100))}]
df = pd.DataFrame(data)
# Bare expression: default display (interactive/notebook sessions only).
df.head()
# Same frame shown with the raised row/column display limits.
display_large(df)
def rf_feature_importance(fitted_model, df):
    """Build a pandas.DataFrame of a fitted model's feature importance per column.

    Args:
        fitted_model: Object exposing a `feature_importances_` attribute with
            one value per column of `df`.
        df (pandas.DataFrame): Frame whose column names label the importances.

    Returns:
        pandas.DataFrame: Columns "Column" and "Importance", sorted by
        importance in descending order.
    """
    importance_table = pd.DataFrame(
        {"Column": df.columns, "Importance": fitted_model.feature_importances_}
    )
    return importance_table.sort_values(by="Importance", ascending=False)
def plot_feature_importance(feature_importance):
    """Draw a horizontal bar plot of feature importances on a new 12x8 figure.

    Args:
        feature_importance (pandas.DataFrame): Frame with the columns
            "Column" and "Importance".

    Returns:
        The object returned by `sns.barplot`.
    """
    _, ax = plt.subplots(figsize=(12, 8))
    # Draw on the freshly created axes (the current axes after plt.subplots).
    return sns.barplot(
        y="Column", x="Importance", data=feature_importance, color="b", ax=ax
    )
def plot_dendogram(X):
    """Plot a dendrogram of the columns of X, clustered by Spearman rank correlation.

    Args:
        X (pandas.DataFrame): Feature frame; its column names label the leaves.
    """
    # Spearman correlation matrix, rounded to 4 decimals
    spearman = np.round(scipy.stats.spearmanr(X).correlation, 4)
    # turn correlation into a distance (1 - corr) in condensed form
    distances = hc.distance.squareform(1 - spearman)
    # average-linkage hierarchical clustering on the distances
    linkage_matrix = hc.linkage(distances, method="average")
    # draw the tree on a new figure, leaves labelled with the column names
    plt.figure(figsize=(16, 10))
    hc.dendrogram(
        linkage_matrix, labels=X.columns, orientation="left", leaf_font_size=16
    )
    plt.show()
def plot_fitting_graph(x, metric_train, metric_valid, metric_name='metric', xlabel='x', yscale='linear'):
    """Plot train and validation metric curves over x on the current axes.

    Args:
        x: values for the horizontal axis.
        metric_train: metric values on the training set, one per entry of x.
        metric_valid: metric values on the validation set, one per entry of x.
        metric_name (str): label of the vertical axis.
        xlabel (str): label of the horizontal axis.
        yscale (str): scale passed to `plt.yscale`.
    """
    # train curve first, then validation, so the line order is unchanged
    for values, label in ((metric_train, 'train'), (metric_valid, 'valid')):
        plt.plot(x, values, label=label)
    plt.yscale(yscale)
    plt.title('Fitting graph')
    plt.xlabel(xlabel)
    plt.ylabel(metric_name)
    plt.grid(True)
    plt.legend(loc='best')
def plot_classifier_boundaries(X, y, clf):
    """
    Plot a classifier's decision regions over the first two features, with the data points on top.

    Args:
        X: feature array of shape (n_samples, n_features); only columns 0 and 1 are used
        y: label array of shape (n_samples), used to colour the points
        clf: classifier whose `predict` is called on the two-column mesh points
    """
    step = .02   # step size in the mesh
    margin = 1   # padding added around the data range on each side

    first, second = X[:, 0], X[:, 1]

    # Mesh covering [min - margin, max + margin) along both features.
    xx, yy = np.meshgrid(
        np.arange(first.min() - margin, first.max() + margin, step),
        np.arange(second.min() - margin, second.max() + margin, step),
    )

    # Predict a label for every mesh point and restore the mesh shape.
    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    Z = clf.predict(mesh_points).reshape(xx.shape)

    # Colour the regions, then overlay the points passed in.
    plt.figure()
    plt.pcolormesh(xx, yy, Z, alpha=1)
    plt.scatter(first, second, c=y, edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())