網(wǎng)站首頁編程語言正文

Python?Pandas數(shù)據(jù)處理高頻操作詳解_python

作者：數(shù)據(jù)不吹牛 ? 更新時間： 2022-08-23 編程語言

引入依賴
算法相關(guān)依賴
獲取數(shù)據(jù)
生成df
重命名列
增加列
缺失值處理
獨熱編碼
替換值
刪除列
數(shù)據(jù)篩選
差值計算
數(shù)據(jù)修改
時間格式轉(zhuǎn)換
設(shè)置索引列
折線圖
散點圖
柱狀圖
熱力圖
66個最常用的pandas數(shù)據(jù)分析函數(shù)

從各種不同的來源和格式導(dǎo)入數(shù)據(jù)
導(dǎo)出數(shù)據(jù)
創(chuàng)建測試對象
查看、檢查數(shù)據(jù)
數(shù)據(jù)選取
數(shù)據(jù)清理
篩選，排序和分組依據(jù)
數(shù)據(jù)合并
數(shù)據(jù)統(tǒng)計

16個函數(shù)，用于數(shù)據(jù)清洗

1.cat函數(shù)
2.contains
3.startswith/endswith
4.count
5.get
6.len
7.upper/lower
8.pad+side參數(shù)/center
9.repeat
10.slice_replace
11.replace
12.replace
13.split方法+expand參數(shù)
14.strip/rstrip/lstrip
15.findall
16.extract/extractall

引入依賴

#?導(dǎo)入模塊
import?pymysql
import?pandas?as?pd
import?numpy?as?np
import?time

#?數(shù)據(jù)庫
from?sqlalchemy?import?create_engine

#?可視化
import?matplotlib.pyplot?as?plt
#?如果你的設(shè)備是配備Retina屏幕的mac，可以在jupyter?notebook中，使用下面一行代碼有效提高圖像畫質(zhì)
%config?InlineBackend.figure_format?=?'retina'
#?解決?plt?中文顯示的問題?mymac
plt.rcParams['font.sans-serif']?=?['Arial?Unicode?MS']
#?設(shè)置顯示中文?需要先安裝字體?aistudio
plt.rcParams['font.sans-serif']?=?['SimHei']?#?指定默認字體
plt.rcParams['axes.unicode_minus']?=?False??#?用來正常顯示負號
import?seaborn?as?sns
#?notebook渲染圖片
%matplotlib?inline
import?pyecharts

#?忽略版本問題
import?warnings
warnings.filterwarnings("ignore")??

#?下載中文字體
!wget?https://mydueros.cdn.bcebos.com/font/simhei.ttf?
#?將字體文件復(fù)制到?matplotlib'字體路徑
!cp?simhei.ttf?/opt/conda/envs/python35-paddle120-env/Lib/python3,7/site-packages/matplotib/mpl-data/fonts.

#?一般只需要將字體文件復(fù)制到系統(tǒng)字體田錄下即可,但是在?studio上該路徑?jīng)]有寫權(quán)限,所以此方法不能用?
#?!cp?simhei.?ttf?/usr/share/fonts/

#?創(chuàng)建系統(tǒng)字體文件路徑
!mkdir?.fonts
#?復(fù)制文件到該路徑
!cp?simhei.ttf?.fonts/
!rm?-rf?.cache/matplotlib

算法相關(guān)依賴

#?數(shù)據(jù)歸一化
from?sklearn.preprocessing?import?MinMaxScaler

#?kmeans聚類
from?sklearn.cluster?import?KMeans
#?DBSCAN聚類
from?sklearn.cluster?import?DBSCAN
#?線性回歸算法
from?sklearn.linear_model?import?LinearRegression
#?邏輯回歸算法
from?sklearn.linear_model?import?LogisticRegression
#?高斯貝葉斯
from?sklearn.naive_bayes?import?GaussianNB
#?劃分訓(xùn)練/測試集
from?sklearn.model_selection?import?train_test_split
#?準確度報告
from?sklearn?import?metrics
#?矩陣報告和均方誤差
from?sklearn.metrics?import?classification_report,?mean_squared_error

獲取數(shù)據(jù)

from?sqlalchemy?import?create_engine
engine?=?create_engine('mysql+pymysql://root:root@127.0.0.1:3306/ry?charset=utf8')

#?查詢插入后相關(guān)表名及行數(shù)
result_query_sql?=?"use?information_schema;"
engine.execute(result_query_sql)
result_query_sql?=?"SELECT?table_name,table_rows?FROM?tables?WHERE?TABLE_NAME?LIKE?'log%%'?order?by?table_rows?desc;"
df_result?=?pd.read_sql(result_query_sql,?engine)

生成df

#?list轉(zhuǎn)df
df_result?=?pd.DataFrame(pred,columns=['pred'])
df_result['actual']?=?test_target
df_result

#?df取子df
df_new?=?df_old[['col1','col2']]

#?dict生成df
df_test?=?pd.DataFrame({<!--?-->'A':[0.587221,?0.135673,?0.135673,?0.135673,?0.135673],?
????????????????????????'B':['a',?'b',?'c',?'d',?'e'],
????????????????????????'C':[1,?2,?3,?4,?5]})

#?指定列名
data?=?pd.DataFrame(dataset.data,?columns=dataset.feature_names)

#?使用numpy生成20個指定分布(如標準正態(tài)分布)的數(shù)
tem?=?np.random.normal(0,?1,?20)
df3?=?pd.DataFrame(tem)

#?生成一個和df長度相同的隨機數(shù)dataframe
df1?=?pd.DataFrame(pd.Series(np.random.randint(1,?10,?135)))

重命名列

#?重命名列
data_scaled?=?data_scaled.rename(columns={<!--?-->'本體油位':?'OILLV'})

增加列

#?df2df
df_jj2yyb['r_time']?=?pd.to_datetime(df_jj2yyb['cTime'])

#?新增一列根據(jù)salary將數(shù)據(jù)分為3組
bins?=?[0,5000,?20000,?50000]
group_names?=?['低',?'中',?'高']
df['categories']?=?pd.cut(df['salary'],?bins,?labels=group_names)

缺失值處理

#?檢查數(shù)據(jù)中是否含有任何缺失值
df.isnull().values.any()

#?查看每列數(shù)據(jù)缺失值情況
df.isnull().sum()

#?提取某列含有空值的行
df[df['日期'].isnull()]

#?輸出每列缺失值具體行數(shù)
for?i?in?df.columns:
????if?df[i].count()?!=?len(df):
????????row?=?df[i][df[i].isnull().values].index.tolist()
????????print('列名："{}", 第{}行位置有缺失值'.format(i,row))

#?眾數(shù)填充
heart_df['Thal'].fillna(heart_df['Thal'].mode(dropna=True)[0],?inplace=True)

#?連續(xù)值列的空值用平均值填充
dfcolumns?=?heart_df_encoded.columns.values.tolist()
for?item?in?dfcolumns:
????if?heart_df_encoded[item].dtype?==?'float':
???????heart_df_encoded[item].fillna(heart_df_encoded[item].median(),?inplace=True)

獨熱編碼

df_encoded?=?pd.get_dummies(df_data)

替換值

#?按列值替換
num_encode?=?{<!--?-->
????'AHD':?{<!--?-->'No':0,?"Yes":1},
}
heart_df.replace(num_encode,inplace=True)

刪除列

df_jj2.drop(['coll_time',?'polar',?'conn_type',?'phase',?'id',?'Unnamed:?0'],axis=1,inplace=True)

數(shù)據(jù)篩選

#?取第33行數(shù)據(jù)
df.iloc[32]

#?某列以xxx字符串開頭
df_jj2?=?df_512.loc[df_512["transformer"].str.startswith('JJ2')]

df_jj2yya?=?df_jj2.loc[df_jj2["變壓器編號"]=='JJ2YYA']

#?提取第一列中不在第二列出現(xiàn)的數(shù)字
df['col1'][~df['col1'].isin(df['col2'])]

#?查找兩列值相等的行號
np.where(df.secondType?==?df.thirdType)

#?包含字符串
results?=?df['grammer'].str.contains("Python")

#?提取列名
df.columns

#?查看某列唯一值（種類）
df['education'].nunique()

#?刪除重復(fù)數(shù)據(jù)
df.drop_duplicates(inplace=True)

#?某列等于某值
df[df.col_name==0.587221]
#?df.col_name==0.587221?各行判斷結(jié)果返回值(True/False)

#?查看某列唯一值及計數(shù)
df_jj2["變壓器編號"].value_counts()

#?時間段篩選
df_jj2yyb_0501_0701?=?df_jj2yyb[(df_jj2yyb['r_time']?&gt;=pd.to_datetime('20200501'))?&amp;?(df_jj2yyb['r_time']?&lt;=?pd.to_datetime('20200701'))]

#?數(shù)值篩選
df[(df['popularity']?&gt;?3)?&amp;?(df['popularity']?&lt;?7)]

#?某列字符串截取
df['Time'].str[0:8]

#?隨機取num行
ins_1?=?df.sample(n=num)

#?數(shù)據(jù)去重
df.drop_duplicates(['grammer'])

#?按某列排序(降序)
df.sort_values("popularity",inplace=True,?ascending=False)

#?取某列最大值所在行
df[df['popularity']?==?df['popularity'].max()]

#?取某列最大num行
df.nlargest(num,'col_name')
#?最大num列畫橫向柱形圖
df.nlargest(10).plot(kind='barh')

差值計算

# axis=0或index表示上下移動， periods表示移動的次數(shù)，為正時向下移，為負時向上移動。
print(df.diff(?periods=1,?axis=‘index‘))
print(df.diff(?periods=-1,?axis=0))
# axis=1或columns表示左右移動，periods表示移動的次數(shù)，為正時向右移，為負時向左移動。
print(df.diff(?periods=1,?axis=‘columns‘))
print(df.diff(?periods=-1,?axis=1))

#?變化率計算
data['收盤價(元)'].pct_change()

#?以5個數(shù)據(jù)作為一個數(shù)據(jù)滑動窗口，在這個5個數(shù)據(jù)上取均值
df['收盤價(元)'].rolling(5).mean()

數(shù)據(jù)修改

#?刪除最后一行
df?=?df.drop(labels=df.shape[0]-1)

#?添加一行數(shù)據(jù)['Perl',6.6]
row?=?{<!--?-->'grammer':'Perl','popularity':6.6}
df?=?df.append(row,ignore_index=True)

#?某列小數(shù)轉(zhuǎn)百分數(shù)
df.style.format({<!--?-->'data':?'{0:.2%}'.format})

#?反轉(zhuǎn)行
df.iloc[::-1,?:]

#?以兩列制作數(shù)據(jù)透視
pd.pivot_table(df,values=["salary","score"],index="positionId")

#?同時對兩列進行計算
df[["salary","score"]].agg([np.sum,np.mean,np.min])

#?對不同列執(zhí)行不同的計算
df.agg({<!--?-->"salary":np.sum,"score":np.mean})

時間格式轉(zhuǎn)換

#?時間戳轉(zhuǎn)時間字符串
df_jj2['cTime']?=df_jj2['coll_time'].apply(lambda?x:?time.strftime("%Y-%m-%d?%H:%M:%S",?time.localtime(x)))

#?時間字符串轉(zhuǎn)時間格式
df_jj2yyb['r_time']?=?pd.to_datetime(df_jj2yyb['cTime'])

#?時間格式轉(zhuǎn)時間戳
dtime?=?pd.to_datetime(df_jj2yyb['r_time'])
v?=?(dtime.values?-?np.datetime64('1970-01-01T08:00:00Z'))?/?np.timedelta64(1,?'ms')
df_jj2yyb['timestamp']?=?v

設(shè)置索引列

df_jj2yyb_small_noise?=?df_jj2yyb_small_noise.set_index('timestamp')

折線圖

fig,?ax?=?plt.subplots()
df.plot(legend=True,?ax=ax)
plt.legend(loc=1)
plt.show()

plt.figure(figsize=(20,?6))
plt.plot(max_iter_list,?accuracy,?color='red',?marker='o',
?????????markersize=10)
plt.title('Accuracy?Vs?max_iter?Value')
plt.xlabel('max_iter?Value')
plt.ylabel('Accuracy')

散點圖

plt.scatter(df[:,?0],?df[:,?1],?c="red",?marker='o',?label='lable0')???
plt.xlabel('x')??
plt.ylabel('y')??
plt.legend(loc=2)??
plt.show()??

柱狀圖

df?=?pd.Series(tree.feature_importances_,?index=data.columns)
#?取某列最大Num行畫橫向柱形圖
df.nlargest(10).plot(kind='barh')

熱力圖

df_corr?=?combine.corr()
plt.figure(figsize=(20,20))
g=sns.heatmap(df_corr,annot=True,cmap="RdYlGn")

66個最常用的pandas數(shù)據(jù)分析函數(shù)

df?#任何pandas?DataFrame對象?
s?#任何pandas?series對象

從各種不同的來源和格式導(dǎo)入數(shù)據(jù)

pd.read_csv(filename)?#?從CSV文件?
pd.read_table(filename)?#?從分隔的文本文件（例如CSV）中?
pd.read_excel(filename)?#?從Excel文件?
pd.read_sql(query,?connection_object)?#?從SQL表/數(shù)據(jù)庫中讀取?
pd.read_json(json_string)?#?從JSON格式的字符串，URL或文件中讀取。
pd.read_html(url)?#?解析html?URL，字符串或文件，并將表提取到數(shù)據(jù)幀列表?
pd.read_clipboard()?#?獲取剪貼板的內(nèi)容并將其傳遞給?read_table()?
pd.DataFrame(dict)?#?從字典中，列名稱的鍵，列表中的數(shù)據(jù)的值

導(dǎo)出數(shù)據(jù)

df.to_csv(filename)?#?寫入CSV文件?
df.to_excel(filename)?#?寫入Excel文件?
df.to_sql(table_name,?connection_object)?#?寫入SQL表?
df.to_json(filename)?#?以JSON格式寫入文件

創(chuàng)建測試對象

pd.DataFrame(np.random.rand(20,5))???????????????#?5列20行隨機浮點數(shù)?pd.Series(my_list)???????????????????????????????#?從一個可迭代的序列創(chuàng)建一個序列?my_list?
df.index?=?pd.date_range('1900/1/30',?periods=df.shape[0])?#?添加日期索引

查看、檢查數(shù)據(jù)

df.head(n)???????????????????????#?DataFrame的前n行?
df.tail(n)???????????????????????#?DataFrame的最后n行?
df.shape?????????????????????????#?行數(shù)和列數(shù)?
df.info()????????????????????????#?索引，數(shù)據(jù)類型和內(nèi)存信息?
df.describe()????????????????????#?數(shù)值列的摘要統(tǒng)計信息?
s.value_counts(dropna=False)?????#?查看唯一值和計數(shù)?
df.apply(pd.Series.value_counts)?#?所有列的唯一值和計數(shù)

數(shù)據(jù)選取

使用這些命令選擇數(shù)據(jù)的特定子集。
df[col]???????????????#?返回帶有標簽col的列?
df[[col1,?col2]]??????#?返回列作為新的DataFrame?
s.iloc[0]?????????????#?按位置選擇?
s.loc['index_one']????#?按索引選擇?
df.iloc[0,:]??????????#?第一行?
df.iloc[0,0]??????????#?第一欄的第一元素

數(shù)據(jù)清理

df.columns?=?['a','b','c']??????????????????#?重命名列?
pd.isnull()?????????????????????????????????#?空值檢查，返回Boolean?Arrray?
pd.notnull()????????????????????????????????#?與pd.isnull()?相反?
df.dropna()?????????????????????????????????#?刪除所有包含空值的行?
df.dropna(axis=1)???????????????????????????#?刪除所有包含空值的列?
df.dropna(axis=1,thresh=n)??????????????????#?刪除所有具有少于n個非null值的行?
df.fillna(x)????????????????????????????????#?將所有空值替換為x?
s.fillna(s.mean())??????????????????????????#?用均值替換所有空值（均值可以用統(tǒng)計模塊中的幾乎所有函數(shù)替換?）?
s.astype(float)?????????????????????????????#?將系列的數(shù)據(jù)類型轉(zhuǎn)換為float?
s.replace(1,'one')??????????????????????????#?1?用?'one'?
s.replace([1,3],['one','three'])????????????#?替換所有等于的值?替換為所有1?'one'?，并?3?用?'three'?df.rename(columns=lambda?x:?x?+?1)??????????#?列的重命名?
df.rename(columns={<!--?-->'old_name':?'new_?name'})#?選擇性重命名?
df.set_index('column_one')??????????????????#?更改索引?
df.rename(index=lambda?x:?x?+?1)????????????#?大規(guī)模重命名索引

篩選，排序和分組依據(jù)

df[df[col]?&gt;?0.5]??????????????????????#?列?col?大于?0.5?df[(df[col]?&gt;?0.5)?&amp;?(df[col]?&lt;?0.7)]??#?小于?0.7?大于0.5的行?
df.sort_values(col1)???????????????????#?按col1升序?qū)χ颠M行排序?
df.sort_values(col2,ascending=False)???#?按col2?降序?qū)χ颠M行?排序?
df.sort_values([col1,col2],ascending=[True,False])?#按?col1?升序排序，然后?col2?按降序排序?
df.groupby(col)????????????????????????#從一個欄返回GROUPBY對象?
df.groupby([col1,col2])?#?返回來自多個列的groupby對象?
df.groupby(col1)[col2]?????????????????#?返回中的值的平均值?col2，按中的值分組?col1?（平均值可以用統(tǒng)計模塊中的幾乎所有函數(shù)替換?）?
df.pivot_table(index=col1,values=[col2,col3],aggfunc=mean)?#?創(chuàng)建一個數(shù)據(jù)透視表組通過?col1?，并計算平均值的?col2?和?col3?
df.groupby(col1).agg(np.mean)??????????#?在所有列中找到每個唯一col1?組的平均值?
df.apply(np.mean)??????????????????????#np.mean()?在每列上應(yīng)用該函數(shù)?
df.apply(np.max,axis=1)????????????????#?np.max()?在每行上應(yīng)用功能

數(shù)據(jù)合并

df1.append(df2)???????????????????#?將df2添加?df1的末尾?（各列應(yīng)相同）?
pd.concat([df1,?df2],axis=1)??????#?將?df1的列添加到df2的末尾?（行應(yīng)相同）?
df1.join(df2,on=col1,how='inner')?# SQL樣式將列 df1 與 df2 行所在的列col 具有相同值的列連接起來。'how'可以是一個?'left'，?'right'，?'outer'，?'inner'

數(shù)據(jù)統(tǒng)計

df.describe()????#?數(shù)值列的摘要統(tǒng)計信息?
df.mean()????????#?返回均值的所有列?
df.corr()????????#?返回DataFrame中各列之間的相關(guān)性?
df.count()???????#?返回非空值的每個數(shù)據(jù)幀列中的數(shù)字?
df.max()?????????#?返回每列中的最高值?
df.min()?????????#?返回每一列中的最小值?
df.median()??????#?返回每列的中位數(shù)?
df.std()?????????#?返回每列的標準偏差

16個函數(shù)，用于數(shù)據(jù)清洗

#?導(dǎo)入數(shù)據(jù)集
import?pandas?as?pd

df?={<!--?-->'姓名':['?黃同學(xué)','黃至尊','黃老邪?','陳大美','孫尚香'],
?????'英文名':['Huang?tong_xue','huang?zhi_zun','Huang?Lao_xie','Chen?Da_mei','sun?shang_xiang'],
?????'性別':['男','women','men','女','男'],
?????'身份證':['463895200003128433','429475199912122345','420934199110102311','431085200005230122','420953199509082345'],
?????'身高':['mid:175_good','low:165_bad','low:159_bad','high:180_verygood','low:172_bad'],
?????'家庭住址':['湖北廣水','河南信陽','廣西桂林','湖北孝感','廣東廣州'],
?????'電話號碼':['13434813546','19748672895','16728613064','14561586431','19384683910'],
?????'收入':['1.1萬','8.5千','0.9萬','6.5千','2.0萬']}
df?=?pd.DataFrame(df)
df

1.cat函數(shù)

用于字符串的拼接

df["姓名"].str.cat(df["家庭住址"],sep='-'*3)

2.contains

判斷某個字符串是否包含給定字符

df["家庭住址"].str.contains("廣")

3.startswith/endswith

判斷某個字符串是否以…開頭/結(jié)尾

#?第一個行的“?黃偉”是以空格開頭的
df["姓名"].str.startswith("黃")?
df["英文名"].str.endswith("e")

4.count

計算給定字符在字符串中出現(xiàn)的次數(shù)

df["電話號碼"].str.count("3")

5.get

獲取指定位置的字符串

df["姓名"].str.get(-1)
df["身高"].str.split(":")
df["身高"].str.split(":").str.get(0)

6.len

計算字符串長度

df["性別"].str.len()

7.upper/lower

英文大小寫轉(zhuǎn)換

df["英文名"].str.upper()
df["英文名"].str.lower()

8.pad+side參數(shù)/center

在字符串的左邊、右邊或左右兩邊添加給定字符

df["家庭住址"].str.pad(10,fillchar="*")??????#?相當(dāng)于ljust()
df["家庭住址"].str.pad(10,side="right",fillchar="*")????#?相當(dāng)于rjust()
df["家庭住址"].str.center(10,fillchar="*")

9.repeat

重復(fù)字符串幾次

df["性別"].str.repeat(3)

10.slice_replace

使用給定的字符串，替換指定的位置的字符

df["電話號碼"].str.slice_replace(4,8,"*"*4)

11.replace

將指定位置的字符，替換為給定的字符串

df["身高"].str.replace(":","-")

12.replace

將指定位置的字符，替換為給定的字符串(接受正則表達式)

replace中傳入正則表達式，才叫好用；- 先不要管下面這個案例有沒有用，你只需要知道，使用正則做數(shù)據(jù)清洗多好用；

df["收入"].str.replace("\d+\.\d+","正則")

13.split方法+expand參數(shù)

搭配join方法功能很強大

#?普通用法
df["身高"].str.split(":")
#?split方法，搭配expand參數(shù)
df[["身高描述","final身高"]]?=?df["身高"].str.split(":",expand=True)
df
#?split方法搭配join方法
df["身高"].str.split(":").str.join("?"*5)

14.strip/rstrip/lstrip

去除空白符、換行符

df["姓名"].str.len()
df["姓名"]?=?df["姓名"].str.strip()
df["姓名"].str.len()

15.findall

利用正則表達式，去字符串中匹配，返回查找結(jié)果的列表

findall使用正則表達式，做數(shù)據(jù)清洗，真的很香！

df["身高"]
df["身高"].str.findall("[a-zA-Z]+")

16.extract/extractall

接受正則表達式，抽取匹配的字符串(一定要加上括號)

df["身高"].str.extract("([a-zA-Z]+)")
#?extractall提取得到復(fù)合索引
df["身高"].str.extractall("([a-zA-Z]+)")
#?extract搭配expand參數(shù)
df["身高"].str.extract("([a-zA-Z]+).*?([a-zA-Z]+)",expand=True

原文鏈接：https://mp.weixin.qq.com/s/4eFMuHmSF3N8I9h1WtHjQg

日本免费高清视频-国产福利视频导航-黄色在线播放国产-天天操天天操天天操天天操|www.shdianci.com

網(wǎng)站首頁 編程語言 正文