python数据可视化

思维导图

折线图

基本操作

绘制了折线图(plt.plot)

# Bring in the pyplot interface under its conventional alias.
import matplotlib.pyplot as plt

# x runs 2, 4, ..., 24; y holds one value per x position.
x = range(2, 26, 2)
y = [15, 13, 14.5, 17, 20, 25, 26, 26, 24, 22, 18, 15]

# Draw the line chart, then display it.
plt.plot(x, y)
plt.show()

设置了图片的大小和分辨率(plt.figure)

# Set the figure size: figsize is (width, height) and dpi is pixels per inch (sharpness).
plt.figure(figsize=(20,8),dpi=80)

实现了图片的保存(plt.savefig)

# Save the current figure to the given path.
plt.savefig("C:/Users/Administrator/2.png")

设置了x轴上的刻度和字符串(xticks)

# Set the global font to Microsoft YaHei (a Chinese-capable font).
import matplotlib
font = dict(family='MicroSoft YaHei', weight='bold', size='12')
matplotlib.rc("font", **font)
# One x position per minute across two hours: 0 .. 119.
x = range(0, 120)
_x = list(x)
# One label per minute: the 10 o'clock hour first, then the 11 o'clock hour.
_xticks_labels = []
for hour_template in ("10点{}分", "11点{}分"):
    _xticks_labels.extend(hour_template.format(minute) for minute in range(60))
# Keep every third position together with its label so the two stay paired,
# and rotate the label text by 270 degrees.
tick_positions = _x[::3]
tick_labels = _xticks_labels[::3]
plt.xticks(tick_positions, tick_labels, rotation=270)

解决了刻度稀疏和密集的问题(xticks)

# One label per minute: 60 labels for the 10 o'clock hour, then 60 for the 11 o'clock hour.
_xticks_labels = ["10点{}分".format(i) for i in range(60)]
_xticks_labels += ["11点{}分".format(i) for i in range(60)]
# Tick positions come from _x, tick text from _xticks_labels.
# Slicing both with the same step keeps each position paired with its label
# while thinning the axis; the label text is rotated 270 degrees.
plt.xticks(_x[::3],_xticks_labels[::3],rotation=270)
# Put y ticks on the integers 0 through 8.
plt.yticks(range(0,9))

设置了标题,xy轴的label(title,xlabel,ylabel)

1
2
3

# Describe the chart: a title plus a caption for each axis.
plt.title("10点到12点每分钟的气温变化情况")
plt.xlabel("时间")
plt.ylabel("温度 单位(℃)")

设置了字体(font_manager.FontProperties, matplotlib.rc)

# matplotlib.rc changes global defaults; here it sets the font used for all text.
import matplotlib
# Font settings: family name, weight and size, passed to rc() as keyword arguments.
font = {'family': 'MicroSoft YaHei',
        'weight': 'bold',
        'size': '12'}
matplotlib.rc("font", **font)

在一个图上绘制多个图形(plt多次plot即可)

# Two y series that share the same x values.
y_1 = [1,0,1,1,2,4,3,2,3,4,4,5,6,5,4,3,3,1,1,1]
y_2 = [1,0,3,1,2,2,3,3,2,1,2,1,1,1,1,1,1,1,1,1]
# Calling plot once per series draws them on the same axes; each call carries
# its own legend label, colour and line style.
for series, name, colour, style in ((y_1, "jack", "r", ':'), (y_2, "Dawei", "cyan", '-.')):
    plt.plot(x, series, label=name, color=colour, linestyle=style)

为不同的图形添加图例

# Add a legend built from the label= given to each plot call.
plt.legend()
# Draw a grid with 0.4 opacity and a dotted line style.
plt.grid(alpha=0.4,linestyle=':')

# Import pyplot under the alias plt; math supplies floor/ceil for the y ticks.
import math
from matplotlib import pyplot as plt
# Figure size: figsize is (width, height) and dpi is pixels per inch (sharpness).
plt.figure(figsize=(20,8),dpi=80)
# x values: range(start, stop (exclusive), step) gives 2, 4, 6, ..., 24.
x = range(2,26,2)
# y values, one per x value, so the points are (2, 15), (4, 13), (6, 14.5), ...
y = [15,13,14.5,17,20,25,26,26,24,22,18,15]
# Draw the line chart from the (x, y) pairs.
plt.plot(x,y)
# To save the figure, uncomment the next line (note the forward slashes in the path).
#plt.savefig("C:/Users/Administrator/2.png")
# Put an x tick on every data point (alternatives: plt.xticks(range(2,25)) or plt.xticks()).
plt.xticks(x)
# Put a y tick on every integer covering the data. range() accepts only integers
# and y contains a float, so round the bounds outward instead of relying on
# min(y) and max(y) happening to be ints.
plt.yticks(range(math.floor(min(y)),math.ceil(max(y))+1))
# Show the figure.
plt.show()

绘制气温

# pyplot draws the chart, random fabricates the sample readings,
# and matplotlib itself is needed for the global font setting.
import random
import matplotlib
from matplotlib import pyplot as plt
# Set the global font to Microsoft YaHei (a Chinese-capable font).
font = dict(family='MicroSoft YaHei', weight='bold', size='12')
matplotlib.rc("font", **font)
# Figure size: figsize is (width, height) and dpi is pixels per inch (sharpness).
plt.figure(figsize=(20, 8), dpi=80)
# One x position per minute across two hours: 0 .. 119.
x = range(0, 120)
# 120 random integers between 20 and 35 (both ends inclusive), one per minute.
y = [random.randint(20, 35) for _ in range(120)]
# Candidate tick positions: every x value (thinned out when passed to xticks).
_x = list(x)
# One label per minute: the 10 o'clock hour first, then the 11 o'clock hour.
_xticks_labels = []
for hour_template in ("10点{}分", "11点{}分"):
    _xticks_labels.extend(hour_template.format(minute) for minute in range(60))
# Slice positions and labels with the same step so they stay paired;
# the label text is rotated 270 degrees.
plt.xticks(_x[::3], _xticks_labels[::3], rotation=270)
# Axis captions and chart title.
plt.xlabel("时间")
plt.ylabel("温度 单位(℃)")
plt.title("10点到12点每分钟的气温变化情况")
# Draw the line chart and show it.
plt.plot(x, y)
plt.show()

绘制交女朋友图

# pyplot draws the chart; matplotlib itself is needed for the global font setting.
import random
import matplotlib
from matplotlib import pyplot as plt
# Set the global font to Microsoft YaHei (a Chinese-capable font).
font = dict(family='MicroSoft YaHei', weight='bold', size='12')
matplotlib.rc("font", **font)
# Figure size: figsize is (width, height) and dpi is pixels per inch (sharpness).
plt.figure(figsize=(20, 8), dpi=80)
# Ages 11 through 30 on the x axis.
x = range(11, 31)
# Two y series, one value per age.
y_1 = [1,0,1,1,2,4,3,2,3,4,4,5,6,5,4,3,3,1,1,1]
y_2 = [1,0,3,1,2,2,3,3,2,1,2,1,1,1,1,1,1,1,1,1]
# Draw both series on the same axes, each with its own label, colour and line style.
for series, name, colour, style in ((y_1, "jack", "r", ':'), (y_2, "Dawei", "cyan", '-.')):
    plt.plot(x, series, label=name, color=colour, linestyle=style)
# y ticks on the integers 0 through 8.
plt.yticks(range(0, 9))
# One x tick per age, labelled with the age followed by "岁".
_xticks_labels = ["{}岁".format(age) for age in x]
plt.xticks(x, _xticks_labels)
# Axis captions and chart title.
plt.xlabel("年龄")
plt.ylabel("个数 单位(个)")
plt.title("11岁到30岁交女朋友个数变化情况")
# Legend from the label= arguments, then a dotted grid with 0.4 opacity.
plt.legend()
plt.grid(alpha=0.4, linestyle=':')
# Show the figure.
plt.show()

散点图

plt.scatter(x,y)

import random
import matplotlib
from matplotlib import pyplot as plt
# Set the global font to Microsoft YaHei (a Chinese-capable font).
font = dict(family='MicroSoft YaHei', weight='bold', size='10')
matplotlib.rc("font", **font)
# Figure size: figsize is (width, height) and dpi is pixels per inch (sharpness).
plt.figure(figsize=(20, 8), dpi=80)
# 31 daily values for March and 31 for October.
y_3 = [11,17,16,11,12,11,12,6,6,7,8,9,12,15,14,17,18,21,16,17,20,14,15,15,15,19,21,22,22,22,23]
y_10 = [26,26,28,19,21,17,16,19,18,20,20,19,22,23,17,20,21,20,22,15,11,15,5,13,17,10,11,13,12,13,6]
# March occupies x = 1..31; October is shifted to x = 51..81 so the two clouds do not overlap.
x_3 = range(1, 32)
x_10 = range(51, 82)
# One scatter call per month, each with its own legend label and colour.
plt.scatter(x_3, y_3, label="3月", color="orange")
plt.scatter(x_10, y_10, label="10月", color="r")
# Tick positions cover both months; October labels subtract the 50 offset
# so they read as day-of-month again.
_x = [*x_3, *x_10]
_xticks_labels = ["3月{}日".format(day) for day in x_3]
_xticks_labels.extend("10月{}日".format(shifted - 50) for shifted in x_10)
# Thin positions and labels with the same step; rotate the text 45 degrees.
plt.xticks(_x[::3], _xticks_labels[::3], rotation=45)
# Axis captions and chart title.
plt.xlabel("时间")
plt.ylabel("温度")
plt.title("3月和10月温度散点图")
# Legend, then show the figure.
plt.legend()
plt.show()

条形图

竖向plt.bar(x,y)

from matplotlib import pyplot as plt
import random
# Set the global font to Microsoft YaHei (a Chinese-capable font).
import matplotlib
font = {'family': 'MicroSoft YaHei',
        'weight': 'bold',
        'size': '10'}
matplotlib.rc("font", **font)
# Figure size: figsize is (width, height) and dpi is pixels per inch (sharpness).
plt.figure(figsize=(20,10),dpi=80)
# a holds the category names, b the value for each name (same order, same length).
a = ["战狼2","速度与激情8","功夫瑜伽","西游伏妖篇","变形金刚5：最后的骑士","摔跤吧！爸爸","加勒比海盗5：死无对证","金刚：骷髅岛","极限特工：终极回归","生化危机6:终章","乘风破浪","神偷奶爸3","智取威虎山","大闹天竺","金刚狼3","蜘蛛侠：英雄归来","悟空传","银河护卫队2","情圣","新木乃伊"]
b = [56.01,26.94,17.53,16.49,15.45,12.96,11.8,11.61,11.28,11.12,10.49,10.3,8.75,7.55,7.32,6.99,6.88,6.86,6.58,6.23]
# Draw vertical bars. x is simply 0 .. len(a)-1 (one slot per name); each bar is 0.3 wide.
plt.bar(range(len(a)),b,width=0.3)
# Label every x slot with the matching string from a, rotated 45 degrees;
# positions and labels must have the same length.
plt.xticks(range(len(a)),a,rotation=45)
# Show the figure.
plt.show()

横向plt.barh(y,width)

from matplotlib import pyplot as plt
import random
# Set the global font to Microsoft YaHei (a Chinese-capable font).
import matplotlib
font = {'family': 'MicroSoft YaHei',
        'weight': 'bold',
        'size': '10'}
matplotlib.rc("font", **font)
# Figure size: figsize is (width, height) and dpi is pixels per inch (sharpness).
plt.figure(figsize=(20,10),dpi=80)
# a holds the category names, b the value for each name (same order, same length).
a = ["战狼2","速度与激情8","功夫瑜伽","西游伏妖篇","变形金刚5：最后的骑士","摔跤吧！爸爸","加勒比海盗5：死无对证","金刚：骷髅岛","极限特工：终极回归","生化危机6:终章","乘风破浪","神偷奶爸3","智取威虎山","大闹天竺","金刚狼3","蜘蛛侠：英雄归来","悟空传","银河护卫队2","情圣","新木乃伊"]
b = [56.01,26.94,17.53,16.49,15.45,12.96,11.8,11.61,11.28,11.12,10.49,10.3,8.75,7.55,7.32,6.99,6.88,6.86,6.58,6.23]
# Draw horizontal bars: barh takes the y slots first, then the bar lengths;
# the bar thickness is given by height= (not width=).
plt.barh(range(len(a)),b,height=0.3,color="orange")
# Label every y slot with the matching string from a; positions and labels must have the same length.
plt.yticks(range(len(a)),a)
# Draw a grid with 0.3 opacity.
plt.grid(alpha=0.3)
# Show the figure.
plt.show()

多个条形图

import random
import matplotlib
from matplotlib import pyplot as plt
# Set the global font to Microsoft YaHei (a Chinese-capable font).
font = dict(family='MicroSoft YaHei', weight='bold', size='10')
matplotlib.rc("font", **font)
# Figure size: figsize is (width, height) and dpi is pixels per inch (sharpness).
plt.figure(figsize=(20, 8), dpi=80)
# Category names and one value list per day (same order as a).
a = ["战狼2","速度与激情8","功夫瑜伽","西游伏妖篇"]
day_16 = [15746,312,4497,319]
day_15 = [12357,156,2045,168]
day_14 = [2358,399,2358,362]
# x positions: day 14 sits on the integer slots, day 15 one bar width to the
# right of it, day 16 two bar widths to the right, so the three bars touch.
bar_width = 0.15
x_14 = list(range(len(a)))
x_15 = [slot+bar_width for slot in x_14]
x_16 = [slot+bar_width*2 for slot in x_14]
# One bar call per day, in the order 14, 15, 16, each with its own legend label.
for positions, values, day_label in (
    (x_14, day_14, "9月14日"),
    (x_15, day_15, "9月15日"),
    (x_16, day_16, "9月16日"),
):
    plt.bar(positions, values, width=bar_width, label=day_label)
# Put the category name under the middle (day 15) bar of each group.
plt.xticks(x_15, a)
# Legend, then show the figure.
plt.legend()
plt.show()

直方图

plt.hist(data,num_bins,density=True)

from matplotlib import pyplot as plt
import random
# Set the global font to Microsoft YaHei (a Chinese-capable font).
import matplotlib
font = {'family': 'MicroSoft YaHei',
        'weight': 'bold',
        'size': '10'}
matplotlib.rc("font", **font)
# Figure size: figsize is (width, height) and dpi is pixels per inch (sharpness).
plt.figure(figsize=(20,10),dpi=80)
a = [131,  98, 125, 131, 124, 139, 131, 117, 128, 108, 135, 138, 131, 102, 107, 114, 119, 128, 121, 142, 127, 130, 124, 101, 110, 116, 117, 110, 128, 128, 115,  99, 136, 126, 134,  95, 138, 117, 111,78, 132, 124, 113, 150, 110, 117,  86,  95, 144, 105, 126, 130,126, 130, 126, 116, 123, 106, 112, 138, 123,  86, 101,  99, 136,123, 117, 119, 105, 137, 123, 128, 125, 104, 109, 134, 125, 127,105, 120, 107, 129, 116, 108, 132, 103, 136, 118, 102, 120, 114,105, 115, 132, 145, 119, 121, 112, 139, 125, 138, 109, 132, 134,156, 106, 117, 127, 144, 139, 139, 119, 140,  83, 110, 102,123,107, 143, 115, 136, 118, 139, 123, 112, 118, 125, 109, 119, 133,112, 114, 122, 109, 106, 123, 116, 131, 127, 115, 118, 112, 135,115, 146, 137, 116, 103, 144,  83, 123, 111, 110, 111, 100, 154,136, 100, 118, 119, 133, 134, 106, 129, 126, 110, 111, 109, 141,120, 117, 106, 149, 122, 122, 110, 118, 127, 121, 114, 125, 126,114, 140, 103, 130, 141, 117, 106, 114, 121, 114, 133, 137,  92,121, 112, 146,  97, 137, 105,  98, 117, 112,  81,  97, 139, 113,134, 106, 144, 110, 137, 137, 111, 104, 117, 100, 111, 101, 110,105, 129, 137, 112, 120, 113, 133, 112,  83,  94, 146, 133, 101,131, 116, 111,  84, 137, 115, 122, 106, 144, 109, 123, 116, 111,111, 133, 150]

# Work out the bins.
bin_width = 3  # width of one bin
# Number of bins = (max - min) // bin_width (integer division); printed for reference.
num_bins = (max(a)-min(a))//bin_width
print(max(a),min(a),max(a)-min(a))
print(num_bins)
# Explicit bin edges: min(a), min(a)+bin_width, ... up to the first edge >= max(a).
# Passing edges instead of a bin count keeps every bin exactly bin_width wide and
# aligned with the x ticks even when (max - min) is not a multiple of bin_width.
bin_edges = list(range(min(a),max(a)+bin_width,bin_width))
# density=True turns the frequency (count) histogram into a density histogram.
plt.hist(a,bin_edges,density=True)
# Put an x tick on every bin edge.
plt.xticks(bin_edges)
# Grid.
plt.grid()
# Show the figure.
plt.show()

思维导图

numpy读取本地数据

import numpy as np
# Load local files.
# np.loadtxt arguments used below: file path, dtype, delimiter string,
# number of leading rows to skip, which columns to use, and unpack (transpose the result).
# Per the original note, each CSV holds views, likes, dislikes and comment totals
# (["views","likes","dislikes","comment_total"]) for 1000+ US / UK YouTube videos;
# the statements below practise indexing on that data.
us_file_path = "./youtube_video_data/US_video_data_numbers.csv"
uk_file_path = "./youtube_video_data/GB_video_data_numbers.csv"
t1 = np.loadtxt(us_file_path,dtype="int",delimiter=",",skiprows=0,usecols=None,unpack=False)
t2 = np.loadtxt(uk_file_path,dtype="int",delimiter=",",skiprows=0,usecols=None,unpack=False)

print(t2)
print("*"*80)
# Select one row. Indexing is [row, column], ":" means "all", and counting starts at 0,
# so index 2 is the third row. Both forms print the same row.
print(t2[2])
print(t2[2,:])
# Select the row at index 2 and every row after it.
print("*"*80)
print(t2[2:])
print(t2[2:,:])
# Select several non-adjacent rows: indices 2, 8 and 10.
print("*"*80)
print(t2[[2,8,10]])
print(t2[[2,8,10],:])
# Select columns.
# Column at index 0 (all rows).
print("*"*80)
print(t2[:,0])
# Adjacent columns: index 2 and everything after it.
print("*"*80)
print(t2[:,2:])
# Non-adjacent columns: indices 1 and 3.
print("*"*80)
print(t2[:,[1,3]])
# A single value: row index 2, column index 3 (third row, fourth column).
print("*"*80)
print(t2[2,3])
# A rectangular block: row indices 2..4 and column indices 1..3
# (slices include the start and exclude the stop).
print("*"*80)
print(t2[2:5,1:4])
# Several separate points: the two index lists are paired element-wise,
# so this picks (row 0, column 0) and (row 2, column 1).
print("*"*80)
print(t2[[0,2],[0,1]])

numpy数组创建

import numpy as np
import random
# Turn a Python list into a numpy array.
t1 = np.array([1, 2, 3])
print(t1)
print(type(t1))

# arange builds the integers 0..11 in one call.
t3 = np.arange(12)
print(t3)

# dtype reports the element type stored in the array.
print(t3.dtype)
# The element type can be chosen at creation time with dtype=.
t4 = np.array(range(1, 4), dtype="float32")
t5 = np.array([1, 0, 1, 0, 1, 1], dtype="bool")
for arr in (t4, t5):
    print(arr)
    print(arr.dtype)
# astype returns a converted copy with the requested element type.
t6 = t5.astype("int8")
print(t6.dtype)
# random.random() yields a float in [0, 1); collect ten of them.
t7 = np.array([random.random() for _ in range(10)])
print(t7)
print(t7.dtype)
# Round every element to two decimal places.
t8 = t7.round(2)
print(t8)

numpy数组数值修改

import numpy as np
# Load local files.
# np.loadtxt arguments used below: file path, dtype, delimiter string,
# number of leading rows to skip, which columns to use, and unpack (transpose the result).
# Per the original note, each CSV holds views, likes, dislikes and comment totals
# (["views","likes","dislikes","comment_total"]) for 1000+ US / UK YouTube videos.
us_file_path = "./youtube_video_data/US_video_data_numbers.csv"
uk_file_path = "./youtube_video_data/GB_video_data_numbers.csv"
t1 = np.loadtxt(us_file_path,dtype="int",delimiter=",",skiprows=0,usecols=None,unpack=False)
t2 = np.loadtxt(uk_file_path,dtype="int",delimiter=",",skiprows=0,usecols=None,unpack=False)

print(t2)
print("*"*80)
# Boolean-mask assignment: every element below 1000 is overwritten with 3 (in place).
t2[t2<1000] = 3
print(t2)
# np.where(condition, value_if_true, value_if_false) works like an element-wise
# ternary: elements below 10 become 0, all others become 20. t2 itself is not modified.
print("*"*80)
t3 = np.where(t2<10,0,20)
print(t3)
# clip(lower, upper) limits values to a range: elements below 10 become 10,
# elements above 20 become 20, anything in between is kept. Returns a new array.
print("*"*80)
t4 = t2.clip(10,20)
print(t4)

numpy数组维度和计算

import numpy as np
# One-dimensional array holding 0..11.
t1 = np.arange(12)
# shape is a tuple with one entry per dimension.
print(t1.shape)
# Two-dimensional array: shape is (rows, columns).
t2 = np.array([[1, 2, 3], [4, 5, 6]])
print(t2)
print(t2.shape)
# Transpose swaps rows and columns.
print("*******")
print(t2.transpose())
# Adding a scalar applies it to every element.
t3 = np.add(t2, 2)
print(t3)
# Reshape the 12 elements of t1 into 3 rows by 4 columns.
t4 = np.reshape(t1, (3, 4))
print(t4)
# Three-dimensional array: (blocks, rows per block, columns per block).
t5 = np.arange(24).reshape(2, 3, 4)
print(t5)
# axis=0 collapses the rows: one sum per column of t3.
t6 = t3.sum(axis=0)
print(t6)
# axis=1 collapses the columns: one sum per row of t3.
t7 = t3.sum(axis=1)
print(t7)

numpy数组nan处理

import numpy as np
# Two-dimensional sample array.
t1 = np.array([[1,2,3],[4,5,6]])
print(t1)
# axis=0 gives one result per column, axis=1 one result per row.
# Sum: t.sum(axis=None). Here: add the rows together, one total per column.
print("*"*10+"列相加"+"*"*10)
t2 = t1.sum(axis=0)
print(t2)
# Mean: t.mean(axis=None). Sensitive to outliers.
print("*"*10+"均值"+"*"*10)
t3 = t1.mean(axis=0)
print(t3)
# Median: np.median(t, axis=None).
print("*"*10+"中值"+"*"*10)
t4 = np.median(t1,axis=0)
print(t4)

# Maximum: t.max(axis=None)
# Minimum: t.min(axis=None)
# Range:   np.ptp(t, axis=None), i.e. maximum minus minimum
# Standard deviation: t.std(axis=None)

# Build an array containing NaN. NaN is a float, so convert to float first.
t5 = np.arange(12).reshape(3,4).astype("float")
# In the row at index 1, replace the column at index 2 and everything after it with NaN.
t5[1,2:] = np.nan
print("*"*10+"nan数组"+"*"*10)
print(t5)
# Replace the NaN entries of each column with the mean of that column's
# non-NaN values. Work on a copy so t5 keeps its NaN entries.
t6 = t5.copy()
for col in range(t6.shape[1]):
    column = t6[:,col]  # a view: writing to it updates t6
    missing = np.isnan(column)
    if missing.any():
        column[missing] = column[~missing].mean()
print("*"*10+"nan填充为列均值"+"*"*10)
print(t6)

作业

# Homework, part 1: plot a histogram of comment counts from the YouTube data
# (only the US file is plotted here).
# Homework, part 2: plot how comment counts relate to like counts for the UK videos.
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
# Set the global font to Microsoft YaHei (a Chinese-capable font).
font = {'family': 'MicroSoft YaHei',
        'weight': 'bold',
        'size': '10'}
matplotlib.rc("font", **font)
# Load local files.
# np.loadtxt arguments used below: file path, dtype, delimiter string,
# number of leading rows to skip, which columns to use, and unpack (transpose the result).
# Per the original note, the columns are ["views","likes","dislikes","comment_total"].
us_file_path = "./youtube_video_data/US_video_data_numbers.csv"
uk_file_path = "./youtube_video_data/GB_video_data_numbers.csv"
t_us = np.loadtxt(us_file_path,dtype="int",delimiter=",",skiprows=0,usecols=None,unpack=False)
t_uk = np.loadtxt(uk_file_path,dtype="int",delimiter=",",skiprows=0,usecols=None,unpack=False)

# ---- Figure 1: histogram of US comment counts ----
# Comment counts are in the last column.
t_us_comments= t_us[:,-1]
# Keep only values of at most 5000.
t_us_comments = t_us_comments[t_us_comments<=5000]
# Print the extremes to help choose the bin width.
print(t_us_comments.max(),t_us_comments.min())
# Bin width, and number of bins = (max - min) // bin_width (integer division).
bin_width = 50
num_bins = (t_us_comments.max()-t_us_comments.min())//bin_width
# The histogram gets its own figure so it is not drawn on the same axes as the scatter plot.
plt.figure(figsize=(20,10),dpi=80)
plt.hist(t_us_comments,num_bins,density =True)

# ---- Figure 2: scatter plot of UK likes against comments ----
# Keep only the rows whose like count (column index 1) is at most 500000.
t_uk = t_uk[t_uk[:,1]<=500000]
t_uk_comments = t_uk[:,-1]
t_uk_like = t_uk[:,1]
plt.figure(figsize=(20,10),dpi=80)
plt.scatter(t_uk_like,t_uk_comments,color="orange")
plt.title("YouTube喜欢和评论的关系")
# Show both figures.
plt.show()

思维导图

知识点

#### numpy数组的拼接
  - np.hstack((t1,t2))
  - np.vstack((t1,t2))

#### Series如何创建，如何进行索引和切片
  - pd.Series([])
  - pd.Series({})  #字典的键就是Series的索引

  - s1["a"]
  - `s1[["a","c"]]`
  - `s1[1]`
  - `s2[[1,5,3]]`
  - s2[4:10]


#### DataFrame如何创建，如何进行索引和切片
  - `pd.DataFrame([[],[],[]])` #接收2维数组
  - pd.DataFrame({"a":[1,23],"c":[2,3]})
  - pd.DataFrame([{},{},{}])


#### DataFrame缺失数据处理
  - 0
    - 并不是所有的都需要处理
    - df[df==0] = np.nan
  - nan
    - pd.isnull ，pd.notnull
    - df.dropna(axis=0,how="any",inplace=True)  # how 可取 "any" 或 "all"
    - df.fillna(df.mean())
    - df["A"] = df["A"].fillna(df["A"].mean())

#### 字符串离散化进行统计
  - 获取分类去重后的列表
  - 构造全为0的DataFrame，形状是（数据的行数，分类列表的长度），列索引是分类去重后的列表
  - 遍历原始数据，对全为0的df赋值
    - zeros_df.loc[i,["T","M"]] = 1
  - 按列进行求和

#### join
  - 按照行索引进行合并

#### merge
  - 按照某一列进行合并
  
  [
  [1,2,3],
  [4,5,6]
  ]
  [
  [10,2,31],
  [43,52,62]
  ]
  ret: 左连接
  [
  [1,2,3,10,2,31],
  [4,5,6,nan,nan,nan]
  ]

  ret: 内连接
  [
  [1,2,3,10,2,31]
  ]

  ret: 外连接
  [
  [1,2,3,10,2,31],
  [4,5,6,nan,nan,nan],
  [nan,nan,nan,43,52,62]
  ]

  ret: 右连接
  [
  [1,2,3,10,2,31],
  [nan,nan,nan,43,52,62]
  ]
  

#### 数据的分组和聚合
  - groupby(by="").count()

  - groupby(by=["",""]).count() --->返回复合索引的df

  - 可迭代




#### 索引的相关知识点
  - df.index
  - df.index = []
  - df.set_index("a") #把某一列作为索引
  - df.set_index(["a","b"]) #把某几列作为索引

  - series
    - `s1["a"]["b"]`
    - `s1["a","b"]`

  - DataFrame
    - `df.loc["a"].loc["b"]`

  - 从内层开始选择
    - df.swaplevel()

pandas读取数据和series

import pandas as pd
# Build a Series from a list; the printout shows the index on the left and the values on the right.
t1 = pd.Series([1,2,31,12,3,4])
print(t1)
# Supply explicit index labels: a, b, c, d, e.
t2 = pd.Series([4,22,31,12,6],index=list("abcde"))
print(t2)
# Build a Series from a dict: each key becomes an index label, each value a data entry.
t_1 = {"name":"jack","age":30,"tel":10086}
t3 = pd.Series(t_1)
print("*"*60+"这是t3"+"*"*60)
print(t3)
print("*"*60)
# dtype of the stored values.
print(t2.dtype)
# Select one entry by its label.
print("*"*60)
print(t3["tel"])
# Select the second entry by position. t3 has string labels, so use .iloc:
# plain t3[1] depends on a positional fallback that pandas has deprecated.
print("*"*60)
print(t3.iloc[1])
# First two entries (positional slice).
print("*"*60)
print(t3.iloc[:2])
# Entries at positions 0 and 2 (a list of positions).
print("*"*60)
print(t3.iloc[[0,2]])
# The same two entries selected by label.
print("*"*60)
print(t3[["name","tel"]])
# Boolean indexing: keep the values greater than 10.
print("*"*60)
print(t1[t1>10])
# The index object of the Series.
print("*"*60)
print(t3.index)
# The index can be iterated label by label.
for i in t3.index:
    print(i)

# Number of labels in the index.
print("*"*60)
print(len(t3.index))
# Convert the index to a plain list.
print("*"*60)
print(list(t3.index))
# First two labels, via list slicing.
print("*"*60)
print(list(t3.index)[:2])
# Read a CSV file into a DataFrame.
df = pd.read_csv("./dogNames2.csv")
print(df)
# Reading from the clipboard:
##pd.read_clipboard()
# Reading from MySQL:
##pd.read_sql()
# head() shows the first rows.
print(df.head())
# Next exercise (not implemented in this snippet): find the most-used dog names.
# Per the original note the first column is the name and the second the count.
## Step one would be sorting by count.
#

pandas的DataFrame创建

import pandas as pd
import numpy as np


def _banner(text):
    """Print *text* framed by 30 asterisks on each side."""
    print("*"*30+text+"*"*30)


# A DataFrame is two-dimensional: the row index is axis 0, the column index axis 1.
t = pd.DataFrame(data=np.arange(12).reshape(3, 4))
print(t)
# The same data with explicit row labels (a, b, c) and column labels (W, X, Y, Z).
_banner("t1")
t1 = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=["a", "b", "c"],
    columns=["W", "X", "Y", "Z"],
)
print(t1)
# Several records, option 1: a dict mapping each column name to a list of values.
d1 = {"name":["xiaoming","xiaohong"],"age":[20,32],"tel":[10086,10010]}
t2 = pd.DataFrame(d1)
_banner("t2")
print(t2)
# Several records, option 2: a list with one dict per row.
d2 = [{"name":"xiaohua","age":25,"tel":10010},{"name":"xiaoming","age":28,"tel":911}]
t3 = pd.DataFrame(d2)
_banner("t3")
print(t3)

pandas的DataFrame描述信息

import pandas as pd


def _banner(text):
    """Print *text* framed by 30 asterisks on each side."""
    print("*"*30+text+"*"*30)


# Sample frame: one column per dict key, one row per list position.
d1 = {"name":["xiaoming","xiaohong"],"age":[20,32],"tel":[10086,10010]}
t2 = pd.DataFrame(d1)
_banner("t2")
print(t2)
# Row index.
_banner("索引信息")
print(t2.index)
# Column index.
_banner("列信息")
print(t2.columns)
# Shape as (rows, columns).
_banner("形状")
print(t2.shape)
# dtype of each column.
_banner("数据类型")
print(t2.dtypes)
# Number of dimensions.
_banner("数据维度")
print(t2.ndim)
# First n rows (here 1).
_banner("显示前一行")
print(t2.head(1))
# Last n rows (here 1).
_banner("显示后一行")
print(t2.tail(1))
# Overview: info() prints its report itself and returns None (which print then
# shows); describe() returns summary statistics of the numeric columns.
_banner("显示概览")
print(t2.info())
print(t2.describe())

pandas的DataFrame的布尔和缺失值

# Find the dog names used more than 800 times.
# Per the original note the first column is the name and the second the count.
# Boolean indexing has the form df[condition].
import pandas as pd
import numpy as np
# Read the CSV file.
df = pd.read_csv("./dogNames2.csv")
print(df)
print(df.head())
# Rows whose count exceeds 800.
print("*"*30+"超过800的"+"*"*30)
print(df[df["Count_AnimalName"]>800])
# Rows with a count above 800 and below 1000. Combine conditions with & or |
# and wrap each condition in parentheses.
print("*"*30+"大于800小于1000的"+"*"*30)
print(df[(800<df["Count_AnimalName"])&(df["Count_AnimalName"]<1000)])
# Rows with a count above 700 whose name is longer than 4 characters;
# .str.len() gives the length of each string in the column.
print("*"*30+"用次数超过700而且名字的字符串的长度大于4"+"*"*30)
print(df[(df["Count_AnimalName"]>700)&(df["Row_Labels"].str.len()>4)])
# Build a small frame and blank out part of it to demonstrate NaN handling.
t3 = pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# NaN is a float. Convert the columns that will receive it first: writing NaN
# straight into an integer column depends on implicit upcasting, which newer
# pandas versions deprecate.
t3 = t3.astype({"W":"float64","X":"float64"})
# Rows from position 1 onward, first two columns -> NaN.
t3.iloc[1:,:2] = np.nan
# Element-wise test for missing values.
print("*"*30+"判断t3是否为nan型数据"+"*"*30)
print(pd.isnull(t3))
# Drop rows containing NaN.
print("*"*30+"直接删除nan行或列"+"*"*30)
# Option 1: keep only the rows where column W is not null.
print(t3[pd.notnull(t3["W"])])
# Option 2: dropna(); axis=0 drops rows.
print(t3.dropna(axis=0))
# how="any": drop a row as soon as it contains at least one NaN.
print("*"*30+"直接删除行中只要存在nan的行"+"*"*30)
print(t3.dropna(axis=0,how="any"))
# how="all": drop a row only when every value in it is NaN.
print("*"*30+"直接删除行中一整行都是nan的行"+"*"*30)
print(t3.dropna(axis=0,how="all"))
# inplace=True would modify t3 itself instead of returning a new frame (left disabled):
# print("*"*30+"直接删除行中一整行都是nan的行并"+"*"*30)
# print(t3.dropna(axis=0,how="all",inplace=True))
# Replace NaN with 0.
print("*"*30+"填充nan的值为0"+"*"*30)
print(t3.fillna(0))
# Replace NaN with the mean of its column (mean() skips NaN by default).
print("*"*30+"填充nan的值为均值"+"*"*30)
print(t3.fillna(t3.mean()))

d1 = {"name":["xiaoming","xiaohong"],"age":[20,32],"tel":[10086,10010]}
t2 = pd.DataFrame(d1)
# Same precaution as above: make "age" a float column before storing NaN in it.
t2["age"] = t2["age"].astype("float64")
t2.iloc[0,1] = np.nan
print("*"*30+"t2"+"*"*30)
print(t2)
# Fill the NaN in the age column only, using that column's mean.
print("*"*30+"age列中的nan填充这一列的平均值"+"*"*30)
t2["age"] = t2["age"].fillna(t2["age"].mean())
print(t2)

pandas的DataFrame应用

# Find the most-used dog names.
# Per the original note the first column is the name and the second the count.
import pandas as pd
import numpy as np
# Read the CSV file.
df = pd.read_csv("./dogNames2.csv")
print(df)
print(df.head())
# Sort by count. by= names the column to sort on; ascending=False puts the largest first.
df = df.sort_values(by="Count_AnimalName",ascending=False)
print("*"*30+"次数排序"+"*"*30)
print(df)
# First 20 rows, via slicing.
print("*"*30+"取前20行"+"*"*30)
print(df[:20])
# The Row_Labels column (the names) of those first 20 rows.
print("*"*30+"取一列"+"*"*30)
print(df[:20]["Row_Labels"])
def _banner(text):
    """Print *text* framed by 30 asterisks on each side."""
    print("*"*30+text+"*"*30)


# loc selects by label, iloc selects by integer position.
t3 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["a", "b", "c"],
    columns=["W", "X", "Y", "Z"],
)
print(t3)
# loc takes [row label, column label].
_banner("loc定位数据")
print(t3.loc["a", "Z"])
# All rows, columns W and Z.
print(t3[["W", "Z"]])
# Rows labelled a through c (label slices include the end label), columns W and Z.
print(t3.loc["a":"c", ["W", "Z"]])
# iloc takes [row position, column position], counted from 0.
_banner("iloc定位数据")
# Row at position 1, i.e. the second row.
print(t3.iloc[1])
# Column at position 2, i.e. the third column.
print(t3.iloc[:, 2])
# Overwrite the value at row 0, column 0 with 2.
t3.iat[0, 0] = 2
print(t3)

pandas的数据的分类与合并

# Goal: count how many movies in the data set belong to each genre.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
file_path = "./IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)
# Split each comma-separated Genre string; temp_list looks like [[...], [...], ...].
temp_list = df["Genre"].str.split(",").tolist()
# Flatten the nested lists and de-duplicate to get every distinct genre.
genre_list = list({genre for genres in temp_list for genre in genres})
# All-zero table: one row per movie, one column per genre.
zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(genre_list))), columns=genre_list)
print(zeros_df)
# Mark with 1 every genre a movie belongs to,
# e.g. zeros_df.loc[0, ["Sci-Fi", "Musical"]] = 1
print("*"*30+"给每个电影出现分类的位置赋值1"+"*"*30)
for row, genres in enumerate(temp_list):
    zeros_df.loc[row, genres] = 1
# Every movie row now has a 1 under each of its genres.
print(zeros_df)

# Column sums give the number of movies per genre.
print("*"*30+"统计每个分类的电影的数量"+"*"*30)
genre_count = zeros_df.sum(axis=0)
print(genre_count)
# Sort ascending by count.
genre_count = genre_count.sort_values()
# Figure size: figsize is (width, height) and dpi is pixels per inch (sharpness).
plt.figure(figsize=(20, 10), dpi=80)
# Bar chart: one slot per genre, labelled with the genre name.
_x = genre_count.index
_y = genre_count.values
slots = range(len(_x))
plt.bar(slots, _y)
plt.xticks(slots, _x)
plt.show()
def _banner(text):
    """Print *text* framed by 30 asterisks on each side."""
    print("*"*30+text+"*"*30)


# join: combines frames on their row index.
df1 = pd.DataFrame(np.ones((2, 4)), index=["A", "B"], columns=list("abcd"))
_banner("df1")
print(df1)

df2 = pd.DataFrame(np.zeros((3, 3)), index=["A", "B", "C"], columns=list("xyz"))
_banner("df2")
print(df2)
# df2's columns are appended to the right of df1; the rows of df1 decide the result.
_banner("df2加入到df1")
print(df1.join(df2))
# df1's columns are appended to the right of df2; the rows of df2 decide the result.
_banner("df1加入到df2")
print(df2.join(df1))
# merge: combines frames on the values of a chosen column (here "a").
df3 = pd.DataFrame(np.zeros((3, 3)), columns=list("fax"))
_banner("df1")
print(df1)
_banner("df3")
print(df3)
# Inner join: only the keys present in both frames.
_banner("df3内连接df1以a列为参考")
t1 = pd.merge(df1, df3, on="a", how="inner")
print(t1)
# Outer join: keys from either frame, missing cells filled with NaN.
_banner("df3外连接df1以a列为参考")
t2 = pd.merge(df1, df3, on="a", how="outer")
print(t2)
# Left join: every row of the left frame (df1), missing cells filled with NaN.
_banner("df3左连接df1以a列为参考")
t3 = pd.merge(df1, df3, on="a", how="left")
print(t3)
# Right join: every row of the right frame (df3), missing cells filled with NaN.
_banner("df3右连接df1以a列为参考")
t4 = pd.merge(df1, df3, on="a", how="right")
print(t4)

pandas的分组与聚合功能

# Given a data set of Starbucks stores worldwide:
# which has more stores, the US or China?
# And how many stores does each Chinese province have?
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Show every column when printing instead of truncating wide frames.
pd.set_option("display.max_columns",None)
file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())
# Group the rows by country.
group_country = df.groupby(by="Country")
print(group_country)
# group_country is a DataFrameGroupBy object.
# It can be iterated: each item is a tuple of (country, DataFrame with that country's rows).
for i in group_country:
    print(i)
    print("*"*20)
# Rows for the US, selected with a boolean mask.
t1 = df[df["Country"]=="US"]
print("*"*30+"查找US国家的数据"+"*"*30)
print(t1)
# Aggregation methods can be called on the grouped object; count() counts the
# non-null values per column in each group.
print("*"*30+"调用聚合方法"+"*"*30)
print(group_country.count())
# Count only the Brand column.
print("*"*30+"只统计brand一列"+"*"*30)
print(group_country["Brand"].count())
# NOTE(review): the original note assumes Brand has no missing values, so its
# per-group count equals the number of stores per country; confirm against the data.
country_count = group_country["Brand"].count()
# Number of stores in the US: index the result by the country label.
print("*"*30+"美国星巴克数量"+"*"*30)
print(country_count["US"])
# Number of stores in China.
print("*"*30+"中国星巴克数量"+"*"*30)
print(country_count["CN"])
# Number of stores per Chinese province.
# First keep only the rows for China.
china_data = df[df["Country"]=="CN"]
print("*"*30+"中国星巴克数据"+"*"*30)
print(china_data)
# Then group those rows by the State/Province column and count Brand.
group_Province = china_data.groupby(by="State/Province")["Brand"].count()

print("*"*30+"中国省份星巴克数据"+"*"*30)
print(group_Province)
# Group by several keys and take the Brand column: the result is a Series
# with a two-level index (country, province) and one column of counts.
group_two_condition = df["Brand"].groupby(by=[df["Country"],df["State/Province"]]).count()
print("*"*30+"按照多个条件进行分组后取Brand这列"+"*"*30)
print(group_two_condition)
# Selecting with double brackets [["Brand"]] keeps a DataFrame instead of a Series.

group_two_condition1 = df[["Brand"]].groupby(by=[df["Country"],df["State/Province"]]).count()
print("*"*30+"变成DataFrame类型"+"*"*30)
print(group_two_condition1)
def _banner(text):
    """Print *text* framed by 30 asterisks on each side."""
    print("*"*30+text+"*"*30)


# Index operations on a small frame.
_banner("df1")
df1 = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    index=["a", "b"],
    columns=["a", "b", "c", "d"],
)
print(df1)
# reindex keeps the rows whose labels already exist and fills unknown labels with NaN.
_banner("df1的reindex")
print(df1.reindex(["a", "f"]))
# set_index promotes a column to the index; by default the column is removed.
_banner("指定某一列为索引")
t2 = df1.set_index("a")
print(t2)
# drop=False keeps the column as well as using it for the index.
_banner("指定某一列为索引并保留这一列")
t3 = df1.set_index("a", drop=False)
print(t3)
# unique() returns the distinct values of an index.
_banner("返回index的唯一值")
by_b = df1.set_index("b")
t4 = by_b.index.unique()
print(t4)

pandas统计电影实例

# Given data on the 1000 most popular movies from 2006 to 2016, find the
# average rating, the number of directors, and similar statistics.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
file_path = "./IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)
# Overview of the columns (info() prints its report and returns None).
print("*"*30+"查看描述信息"+"*"*30)
print(df.info())
# First row.
print("*"*30+"打印第一行"+"*"*30)
print(df.head(1))
# Average rating.
print("*"*30+"获取电影平均评分"+"*"*30)
print(df["Rating"].mean())
# Number of distinct directors: tolist() gives a list, set() removes duplicates.
print("*"*30+"获取导演人数"+"*"*30)
print(len(set(df["Director"].tolist())))
print(df["Director"].tolist())
# Number of distinct actors.
print("*"*30+"获取演员人数"+"*"*30)
# print(df["Actors"])
# Split each row's comma-separated Actors string into a small list,
# then collect the small lists into one big list.
temp_actors_list = df["Actors"].str.split(",").tolist()
print(df["Actors"].str.split(",").tolist())
# The list is nested two levels deep, so flattening needs two loops.
actors_list = [i for j in temp_actors_list for i in j]
print(actors_list)
actors_num = len(set(actors_list))
print(actors_num)
# Longest runtime.
max_runtime = df["Runtime (Minutes)"].max()
print("*"*30+"电影时长的最大值"+"*"*30)
print(max_runtime)
# Index label of the row holding the longest runtime: idxmax() returns the
# label of the maximum, whereas max() returns the value itself.
max_runtime_index = df["Runtime (Minutes)"].idxmax()
print("*"*30+"电影时长的最大值对应的索引"+"*"*30)
print(max_runtime_index)

pandas的时间序列上

# Given 250,000 emergency (911) call records from 2015 to 2017:
# count how often each type of emergency occurs.
# (How the counts change from month to month is a separate exercise.)
import pandas as pd
import numpy as np
pd.set_option("display.max_columns",None)
df = pd.read_csv("./911.csv")
print(df.head(10))
print(df.info())
# The category is the part of the title before ": " (per the original note,
# EMS is a medical emergency and Fire is a fire).
# Split every title on ": " to get at the category.
print("*"*30+"data_categories"+"*"*30)
data_categories_temp = df["title"].str.split(": ").tolist()
# Take element 0 of each split result and de-duplicate.
data_categories = list(set([i[0] for i in data_categories_temp]))
print(data_categories)
# All-zero table used for counting: one row per call, one column per category.
zero_array = pd.DataFrame(np.zeros((df.shape[0],len(data_categories))),columns=data_categories)
print("*"*30+"造一个全为0的数组来统计分类"+"*"*30)
print(zero_array)
print("*"*30+"title包含上面的分类时，给上面的0数组赋值为1"+"*"*30)
# Set the cell to 1 wherever the title contains the category name.
# A single .loc[row_mask, column] assignment writes into zero_array itself;
# the chained form zero_array[i][mask] = 1 may write to a temporary copy.
for i in data_categories:
    zero_array.loc[df["title"].str.contains(i),i] = 1
print(zero_array)
print("*"*50)
# Column sums give the number of calls per category.
t1 = zero_array.sum(axis=0)
print(t1)

pandas的时间序列下

# coding=utf-8
# How the number of 911 calls of each type changes from month to month.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Load the data and parse the timestamp strings into datetime values.
df = pd.read_csv("./911.csv")
df["timeStamp"] = pd.to_datetime(df["timeStamp"])

# Add a "cate" column holding the category: the part of the title before ": ".
temp_list = df["title"].str.split(": ").tolist()
cate_list = [parts[0] for parts in temp_list]
df["cate"] = cate_list

# Use the timestamp as the index so the rows can be resampled by time.
df.set_index("timeStamp", inplace=True)

print(df.head(1))

plt.figure(figsize=(20, 8), dpi=80)

# One line per category.
for group_name, group_data in df.groupby(by="cate"):

    # Calls per month for this category ("M" resamples to month-end bins).
    monthly_calls = group_data.resample("M").count()["title"]

    print(monthly_calls.index)
    _y = monthly_calls.values

    # Month labels formatted as YYYYMMDD.
    _x = [stamp.strftime("%Y%m%d") for stamp in monthly_calls.index]

    plt.plot(range(len(_x)), _y, label=group_name)


# NOTE(review): the tick labels come from the last category processed in the loop;
# this presumably assumes every category spans the same months, so confirm.
plt.xticks(range(len(_x)), _x, rotation=45)
plt.legend(loc="best")
plt.show()

pandas PM2.5案例

# coding=utf-8
import pandas as pd
from matplotlib import  pyplot as plt
file_path = "./BeijingPM20100101_20151231.csv"

df = pd.read_csv(file_path)

# Combine the separate year/month/day/hour columns into one pandas period index (hourly frequency).
period = pd.PeriodIndex(year=df["year"],month=df["month"],day=df["day"],hour=df["hour"],freq="H")
df["datetime"] = period
# print(df.head(10))

# Use the combined datetime column as the index.
df.set_index("datetime",inplace=True)

# Downsample: average the readings over 7-day windows.
df = df.resample("7D").mean()
print(df.head())
# Missing values are not removed here; the two series are plotted as they are.
# print(df["PM_US Post"])

data  =df["PM_US Post"]
data_china = df["PM_Nongzhanguan"]

print(data_china.head(100))
# Plot.
_x = data.index
_x = [i.strftime("%Y%m%d") for i in _x]
_x_china = [i.strftime("%Y%m%d") for i in data_china.index]
# Print the length of BOTH label lists so they can be compared.
print(len(_x),len(_x_china))
_y = data.values
_y_china = data_china.values


plt.figure(figsize=(20,8),dpi=80)

# One line per series, slightly transparent so overlapping parts stay visible.
plt.plot(range(len(_x)),_y,label="US_POST",alpha=0.7)
plt.plot(range(len(_x_china)),_y_china,label="CN_POST",alpha=0.7)

# Label every 10th position with its date string, rotated 45 degrees.
plt.xticks(range(0,len(_x_china),10),list(_x_china)[::10],rotation=45)

plt.legend(loc="best")

plt.show()

代码软件下载