Sklearn Study Notes

"Sklearn学习笔记"

Posted by jhljx on January 28, 2018


The statsmodels library, here used for an ordinary least squares (OLS) regression:

import statsmodels.api as sm

# regress sepal length on sepal width for the first 50 iris rows
y = df['sepal length'][:50]
x = df['sepal width'][:50]
X = sm.add_constant(x)  # add an intercept column

results = sm.OLS(y, X).fit()
print(results.summary())

Sklearn Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # shuffles the rows before splitting; replaces the deprecated sklearn.cross_validation module

clf = RandomForestClassifier(max_depth=5, n_estimators=10)

X = df.iloc[:, :4]
y = df.iloc[:, 4]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

rf = pd.DataFrame(list(zip(y_pred, y_test)), columns=['predicted', 'actual'])
rf['correct'] = rf.apply(lambda r: 1 if r['predicted'] == r['actual'] else 0, axis=1)
    
rf

f_importances = clf.feature_importances_  # each feature's relative ability to split nodes across the forest's trees; the values sum to 1
f_names = df.columns[:4]
f_std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)

zz = zip(f_importances, f_names, f_std)
zzs = sorted(zz, key=lambda x: x[0], reverse=True)

imps = [x[0] for x in zzs]
labels = [x[1] for x in zzs]
errs = [x[2] for x in zzs]

plt.bar(range(len(f_importances)), imps, color="r", yerr=errs, align="center")
plt.xticks(range(len(f_importances)), labels);
# petal length and width have strong discriminative power for separating the iris species

The patsy library is used together with statsmodels; it lets us write R-style formulas.
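As a minimal sketch of that formula interface, assuming the same iris DataFrame df as in the OLS example above (patsy's Q() quotes column names that contain spaces):

import statsmodels.formula.api as smf

# same regression as before, but specified as an R-style formula string
results = smf.ols("Q('sepal length') ~ Q('sepal width')", data=df[:50]).fit()
print(results.summary())

The formula string replaces the manual add_constant step: patsy adds the intercept automatically.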

Use DBSCAN, a density-based spatial clustering algorithm, to perform the clustering.

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

X = StandardScaler().fit_transform(ff)  # ff: the feature DataFrame being clustered
db = DBSCAN(eps=.5, min_samples=1).fit(X)

labels = db.labels_
clusters = len(set(labels))
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

plt.subplots(figsize=(12,8))

for k, c in zip(unique_labels, colors):
    class_member_mask = (labels == k)
    xy = X[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o',
             markerfacecolor=c,
             markeredgecolor='k',
             markersize=14)

plt.title("Total Clusters: {}".format(clusters),
          fontsize=14, y=1.01)

A small algorithm from natural language processing: term frequency-inverse document frequency (tf-idf). The tf-idf value is term frequency * log(total number of documents / number of documents containing the term).
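A quick hand computation of that formula, with made-up counts (note that sklearn's TfidfVectorizer uses a smoothed variant of the idf term by default):

import math

tf = 3             # the term appears 3 times in this document
n_docs = 100       # total documents in the corpus
docs_with_term = 10  # documents containing the term
tfidf = tf * math.log(n_docs / docs_with_term)
print(tfidf)       # 3 * log(10) ≈ 6.91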

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

vect = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=3)
tv = vect.fit_transform(df['text'])
clf = LinearSVC()
model = clf.fit(tv, df['wanted'])

tvcomb = vect.fit_transform(combined['text'], combined['wanted'])
model = clf.fit(tvcomb, combined['wanted'])

The KMeans algorithm:

from sklearn.cluster import KMeans

clf = KMeans(n_clusters=16)
clf.fit(dfc[['reds', 'greens', 'blues']].dropna())
clusters = pd.DataFrame(clf.cluster_centers_, columns=['r', 'g', 'b'])
clusters.style.apply(color_cells, subset=['color'], axis=0)

from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_actual = y_test
deltas = pd.DataFrame(list(zip(y_pred, y_actual, (y_pred - y_actual) / y_actual)),
                      columns=['predicted', 'actual', 'delta'])

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(ngram_range=(1,3))
X_titles_all = vect.fit_transform(all_data['title'])
X_titles_train = X_titles_all[train_index]
X_titles_test = X_titles_all[test_index]

from sklearn.svm import SVR

clf = SVR(kernel='linear')
X_train = sp20[:-2000]
y_train = sp20['Close'].shift(-1)[:-2000]  # predict the next day's close
model = clf.fit(X_train, y_train)
preds = model.predict(X_test)

from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np

digits = datasets.load_digits()

def display_img(img_no):
    fig, ax = plt.subplots()
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.matshow(digits.images[img_no], cmap=plt.cm.binary)

display_img(0)
digits.images[0]

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import chi2_kernel

X = digits.data
y = digits.target
k_sim = chi2_kernel(X[0].reshape(1, -1), X)
kf = pd.DataFrame(k_sim).T
kf.columns = ['similarity']
kf.sort_values('similarity', ascending=False)

co_sim = cosine_similarity(X[0].reshape(1,-1), X)

import graphlab
graphlab.canvas.set_target('ipynb')

gl_img = graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/coursera/deep_learning/image_train_data')
gl_img['image'][0:5].show()
graphlab.image_analysis.resize(gl_img['image'][2:3], 96, 96).show()

img = graphlab.Image('/Users/alexcombs/Downloads/profile_pic.jpg')
ppsf = graphlab.SArray([img])
ppsf = graphlab.image_analysis.resize(ppsf, 32, 32)
ppsf.show()
ppsf = graphlab.SFrame(ppsf).rename({'X1': 'image'})

deep_learning_model = graphlab.load_model('http://s3.amazonaws.com/GraphLab-Datasets/deeplearning/imagenet_model_iter45')
ppsf['deep_features'] = deep_learning_model.extract_features(ppsf)
ppsf['label'] = 'me'
gl_img['id'].max()
ppsf['id'] = 50000

labels = ['id', 'image', 'label', 'deep_features']
part_train = gl_img[labels]
new_train = part_train.append(ppsf[labels])
new_train.tail()

knn_model = graphlab.nearest_neighbors.create(new_train, features=['deep_features'], label='id')
cat_test = new_train[145:146]
graphlab.image_analysis.resize(cat_test['image'], 96, 96).show()
sim_frame = knn_model.query(cat_test)

def reveal_my_twin(x):
    return gl_img.filter_by(x['reference_label'], 'id')

spirit_animal = reveal_my_twin(knn_model.query(cat_test))
spirit_animal['image'].show()

me_test = new_train[-1:]
graphlab.image_analysis.resize(me_test['image'], 96, 96).show()
sim_frame = knn_model.query(me_test)

spirit_animal = reveal_my_twin(knn_model.query(me_test))
graphlab.image_analysis.resize(spirit_animal['image'][0:1], 96, 96).show()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer(ngram_range=(1,3))
vec = vectorizer.fit_transform(convo_frame['q'])
my_q = vectorizer.transform(['Hi. My name is Alex.'])
cs = cosine_similarity(my_q, vec)
rs = pd.Series(cs[0]).sort_values(ascending=False)
top5 = rs.iloc[0:5]
convo_frame.iloc[top5.index]['q']
rsi = rs.index[0]
convo_frame.iloc[rsi]['a']

def get_response(q):
    my_q = vectorizer.transform([q])
    cs = cosine_similarity(my_q, vec)
    rs = pd.Series(cs[0]).sort_values(ascending=False)
    rsi = rs.index[0]
    return convo_frame.iloc[rsi]['a']

get_response('Yes, I am clearly more clever than you will ever be!')

from sklearn.metrics import jaccard_similarity_score
from scipy.stats import pearsonr

sim_score = {}
for i in range(len(fdf)):
    ss = pearsonr(fdf.iloc[-1, :], fdf.iloc[i, :])
    sim_score.update({i: ss[0]})  # keep the Pearson correlation coefficient

sf = pd.Series(sim_score).to_frame('similarity')
sf.sort_values('similarity', ascending=False)

from sklearn import linear_model

clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)  # mean accuracy on the test set
pred_label = clf.predict(X_test)