virtualenv 是由 Ian Bicking 启动的一个开源项目,它使开发者能够隔离每个项目的工作环境,以便于不同包版本的维护。
怎么做
通过执行以下步骤,您可以安装 virtualenv 和 virtualenvwrapper 工具:
您可能想要将以下行添加到您的 ~/.bashrc文件中:
source /usr/local/bin/virtualenvwrapper.sh
现在可以在 virt1 里面安装我们喜欢的包了:
(virt1)user1:~$ pip install matplotlib
安装 virtualenv 和 virtualenvwrapper:
$ sudo pip install virtualenv
$ sudo pip install virtualenvwrapper
# Create folder to hold all our virtual environments and export the path to it.
$ export VIRTENV=~/.virtualenvs
$ mkdir -p $VIRTENV
# We source (ie. execute) shell script to activate the wrappers
$ source /usr/local/bin/virtualenvwrapper.sh
# And create our first virtual environment
$ mkvirtualenv virt1
少数有用的和最常用的命令如下:
mkvirtualenv ENV:此创建名为 ENV 的虚拟环境并激活
workon ENV:这激活之前创建的 ENV
deactivate:这个让我们脱离了当前的虚拟环境
在 Mac OS X 上安装 matplotlib
在 Mac OS X 上获取 matplotlib 最简单的方法是使用预打包的 Python 发行版,如 EPD。只需前往 EPD 网站,下载并安装适用于您操作系统的最新稳定版即可。
import matplotlib.pyplot as plt
import numpy as np

# t covers [0.0, 1.0) in steps of 0.01
t = np.arange(0.0, 1.0, 0.01)

# make line red
s = np.sin(2 * np.pi * t)
plt.rcParams['lines.color'] = 'r'
plt.plot(t, s)

# make line thick
c = np.cos(2 * np.pi * t)
plt.rcParams['lines.linewidth'] = '3'
plt.plot(t, c)

plt.show()
import csv
import sys

filename = 'ch02-data.csv'
data = []
header = None

try:
    with open(filename) as f:
        reader = csv.reader(f)
        # The first row is the header; next(..., None) keeps an empty file
        # from raising StopIteration.
        header = next(reader, None)
        data = list(reader)
except csv.Error as e:
    print("Error reading CSV file at line %s: %s" % (reader.line_num, e))
    sys.exit(-1)

if header:
    print(header)
    print('==================')

for datarow in data:
    print(datarow)
import xlrd
from pprint import pprint

# Named "filename" rather than "file" so the Python 2 builtin is not shadowed.
filename = 'ch02-xlsxdata.xlsx'

wb = xlrd.open_workbook(filename=filename)
ws = wb.sheet_by_name('Sheet1')

# One inner list per worksheet row, holding the value of every cell in it.
dataset = [
    [ws.cell(r, c).value for c in range(ws.ncols)]
    for r in range(ws.nrows)
]

pprint(dataset)
import struct
import string

datafile = 'ch02-fixed-width-1M.data'

# this is where we define how to
# understand line of data from the file
mask = '9s14s5s'
# Compile the format once instead of once per line.
record = struct.Struct(mask)

# Binary mode: struct unpacks from a bytes buffer.
with open(datafile, 'rb') as f:
    for line in f:
        fields = record.unpack_from(line)
        print('fields:  %s' % ([field.strip() for field in fields],))
import csv
import sys

filename = 'ch02-data.tab'
data = []
header = None

try:
    with open(filename) as f:
        # Same as plain CSV reading, but with the tab-delimited dialect.
        reader = csv.reader(f, dialect=csv.excel_tab)
        # next(..., None) keeps an empty file from raising StopIteration.
        header = next(reader, None)
        data = list(reader)
except csv.Error as e:
    print("Error reading CSV file at line %s: %s" % (reader.line_num, e))
    sys.exit(-1)

if header:
    print(header)
    print('===================')

for datarow in data:
    print(datarow)
if __name__ == '__main__':
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("import_file", help="Path to a fixed-width data file.")
    parser.add_argument("export_format", help="Export format: json, csv, xlsx.")
    args = parser.parse_args()

    if args.import_file is None:
        sys.stderr.write("You must specify path to import from.\n")
        sys.exit(1)

    if args.export_format not in ('csv', 'json', 'xlsx'):
        sys.stderr.write("You must provide valid export file format.\n")
        sys.exit(1)

    # verify given path is an accessible file
    if not os.path.isfile(args.import_file):
        sys.stderr.write("Given path is not a file: %s\n" % args.import_file)
        sys.exit(1)

    # read from formatted fixed-width file
    data = import_data(args.import_file)

    # export data to specified format
    # to make this Unix-like pipe-able
    # we just print to stdout
    print(write_data(data, args.export_format))
我们单独为每个数据格式 (CSV、JSON 和 XLSX) 指定单独实现。
def write_csv(data):
    '''Transform data into CSV and return it as a string.

    data is an iterable of rows; every row is a sequence of fields that
    csv.writer can serialise.
    '''
    # csv can only write to file-like objects, so an in-memory buffer stands
    # in for a real file.  The local import tries both module layouts so the
    # function runs on Python 2 and Python 3.
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

    buf = StringIO()
    csv.writer(buf).writerows(data)
    # Get the content of the file-like object
    return buf.getvalue()
def write_json(data):
    '''Transform data into a JSON document and return it as a string.'''
    return json.dumps(data)
def write_xlsx(data):
    '''Write data into a workbook and return the saved workbook content.

    data is an iterable of rows; every row is an iterable of cell values.
    Rows beyond the sheet limit are dropped and a message is written to
    stderr.
    '''
    import io
    from xlwt import Workbook

    book = Workbook()
    sheet1 = book.add_sheet("Sheet 1")

    # No per-cell debug print here: the caller prints the returned value to
    # stdout, so any extra output would corrupt the piped result.
    for row, line in enumerate(data):
        for col, datum in enumerate(line):
            sheet1.write(row, col, datum)
        # We have hard limit here of 65535 rows
        # that we are able to save in spreadsheet.
        if row >= 65535:
            sys.stderr.write("Hit limit of # of rows in one sheet (65535).\n")
            break

    # The workbook is binary, so save it into an in-memory byte buffer and
    # hand back the raw content.
    buf = io.BytesIO()
    book.save(buf)
    return buf.getvalue()
import sqlite3
import sys

if len(sys.argv) < 2:
    print("Error: You must supply at least SQL script.")
    # The script path comes first and the database file is optional,
    # matching how the arguments are read below.
    print("Usage: %s ./sql-dump.sql [table.db]" % (sys.argv[0]))
    sys.exit(1)

script_path = sys.argv[1]

if len(sys.argv) == 3:
    db = sys.argv[2]
else:
    # if DB is not defined
    # create memory database
    db = ":memory:"

try:
    con = sqlite3.connect(db)
    # "with con" commits on success and rolls back on an exception.
    with con:
        cur = con.cursor()
        # Text mode: executescript() expects a string of SQL statements.
        with open(script_path, 'r') as f:
            cur.executescript(f.read())
except sqlite3.Error as err:
    print("Error occured: %s" % err)
将数据导入数据库后,我们就可以查询数据并进行一些处理。下面是从数据库文件中读取数据的代码:
import sqlite3
import sys

if len(sys.argv) != 2:
    print("Please specify database file.")
    sys.exit(1)

db = sys.argv[1]

try:
    con = sqlite3.connect(db)
    with con:
        cur = con.cursor()
        query = 'SELECT ID, Name, Population FROM City ORDER BY Population DESC LIMIT 1000'
        con.text_factory = str
        cur.execute(query)
        resultset = cur.fetchall()

        # extract column names
        col_names = [cn[0] for cn in cur.description]
        print("%10s %30s %10s" % tuple(col_names))
        # rule as wide as the three columns plus their separating spaces
        print("=" * (10 + 1 + 30 + 1 + 10))

        for row in resultset:
            print("%10s %30s %10s" % row)
except sqlite3.Error as err:
    print("[ERROR]: %s" % err)
import numpy as np
import matplotlib.pyplot as plt
def is_outlier(points, threshold=3.5):
    """Return a boolean array that is True where points are outliers.

    points    -- array-like of observations; a 1-D input is treated as a
                 column of single-value observations
    threshold -- data points with a modified z-score greater than this
                 value will be classified as outliers
    """
    points = np.asarray(points)

    # transform into vector
    if len(points.shape) == 1:
        points = points[:, None]

    # compute median value
    median = np.median(points, axis=0)

    # distance of every observation from the median
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))

    # compute MAD
    med_abs_deviation = np.median(diff)

    # compute modified Z-score
    # http://www.itl.nist.gov/div898/handbook/eda/section4/eda43.htm#Iglewicz
    # NOTE: if med_abs_deviation is 0 this division yields inf/nan.
    modified_z_score = 0.6745 * diff / med_abs_deviation

    # return a mask for each outlier
    return modified_z_score > threshold
# 100 uniformly distributed random samples
x = np.random.random(100)

# number of histogram buckets
buckets = 50

# append a handful of obvious outliers
x = np.r_[x, -49, 95, 100, -100]

# keep only the valid points; "~" negates the boolean mask element-wise
filtered = x[~is_outlier(x)]

# raw data on top, cleaned data below
plt.figure()
for position, values, label in ((211, x, 'Raw'), (212, filtered, 'Cleaned')):
    plt.subplot(position)
    plt.hist(values, buckets)
    plt.xlabel(label)

plt.show()
from pylab import *

# fake data: a wide spread, a cluster at 50, and outliers on both sides
spread = 100 * rand(50)
center = 50 * ones(25)
flier_high = 100 * rand(10) + 100
flier_low = -100 * rand(10)

# everything merged into one data set for the box plot
data = concatenate([spread, center, flier_high, flier_low], axis=0)

# basic box plot; 'gx' sets how the outliers are drawn
subplot(311)
boxplot(data, 0, 'gx')

# the spread and fliers again as a scatter plot, all at x = 25
subplot(312)
spread_1 = concatenate([spread, flier_high, flier_low], axis=0)
center_1 = 25 * ones(70)
scatter(center_1, spread_1)
xlim(0, 50)

# same values with random x positions, which suits a scatter plot better
subplot(313)
center_2 = 50 * rand(70)
scatter(center_2, spread_1)
xlim(0, 50)

show()
我们还可以看到,在散点图中显示类似数据集的第二个图不是很直观,因为 x 轴的所有值都在 25,我们并没有真正区分正常值 (inlier) 和异常值 (outlier)。
第三个图,我们在 x 轴上生成的值为分布在从 0 到 50 的范围内,让我们更清楚地看到不同的值,我们可以看到哪些值是 y 轴上的异常值。
# uniformly distributed data points
x = 1e6 * rand(1000)
y = rand(1000)

figure()

# Same scatter drawn twice: linear x axis on top, logarithmic x axis below.
for position, x_scale in ((211, None), (212, 'log')):
    subplot(position)
    scatter(x, y)
    if x_scale is not None:
        xscale(x_scale)
    # identical x limits on both subplots
    xlim(1e-6, 1e6)

show()
import sys

filename = sys.argv[1]  # must pass valid file name

with open(filename, 'rb') as hugefile:
    chunksize = 1000
    readable = b''
    at_eof = False

    # if you want to stop after certain number of blocks
    # put condition in the while
    while not at_eof:
        # if you want to start not from 1st byte
        # do a hugefile.seek(skipbytes) to skip
        # skipbytes of bytes from the file start
        start = hugefile.tell()
        print("starting at: %s" % start)

        file_block = b''  # holds chunksize lines
        for _ in range(chunksize):
            # readline() returns an empty value at end of file instead of
            # raising, and unlike next() it can be mixed with tell().
            line = hugefile.readline()
            if not line:
                at_eof = True
                break
            file_block = file_block + line
            print('file_block %s %s' % (type(file_block), file_block))

        if not file_block:
            # nothing left to report for this chunk
            break

        readable = readable + file_block
        # tell where we are in the file
        stop = hugefile.tell()
        print('readable %s %s' % (type(readable), readable))
        print('reading bytes from %s to %s' % (start, stop))
        print('read bytes total: %s' % len(readable))

        # if you want to pause read between chunks
        # uncomment following line
        # raw_input()
import time
import os
import sys

if len(sys.argv) != 2:
    sys.stderr.write("Please specify filename to read\n")
    # Stop here: continuing would fail on sys.argv[1] below.
    sys.exit(1)

filename = sys.argv[1]

if not os.path.isfile(filename):
    sys.stderr.write("Given file: \"%s\" is not a file\n" % filename)
    sys.exit(1)

with open(filename, 'r') as f:
    # Move to the end of file
    f.seek(0, os.SEEK_END)

    # endlessly loop
    while True:
        where = f.tell()
        # try reading a line
        line = f.readline()
        if not line:
            # if empty, wait and go back to where we were
            time.sleep(1)
            f.seek(where)
        else:
            # readline() already kept the newline, so write the line as is
            sys.stdout.write(line)
将图像数据导入 NumPy 数组
我们将演示如何使用 NumPy 和 SciPy 等 Python 库来进行图像处理。
在科学的计算中,图像通常被视为 n 维数组。它们通常是二维数组;在我们的示例中,它们被表示为 NumPy 数组数据结构。因此,在这些结构上执行的功能和操作被视为矩阵操作。
从这个意义上说,图像并不总是二维的。对于医学或生物科学,图像是更高维度的数据结构,例如 3D(以 z 轴作为深度或时间轴) 或 4D(以三个空间维度和一个时间维度作为第四维度)。我们不会在这个食谱中使用这些。
import scipy.misc
import matplotlib.pyplot as plt

# load already prepared ndarray from scipy
# NOTE: lena() is missing from newer SciPy releases; fall back to the
# ascent() sample image when it is unavailable so the example still runs.
load_sample = getattr(scipy.misc, 'lena', None) or scipy.misc.ascent
lena = load_sample()

# set the default colormap to gray
plt.gray()

plt.imshow(lena)
plt.colorbar()
plt.show()
import matplotlib.pyplot as plt
import scipy
# "import scipy" alone does not load the misc submodule used below.
import scipy.misc
import numpy

bug = scipy.misc.imread('stinkbug1.png')

# if you want to inspect the shape of the loaded image
# uncomment following line
# print(bug.shape)

# the original image is RGB having values for all three
# channels separately. We need to convert that to greyscale image
# by picking up just one channel.

# convert to gray
bug = bug[:, :, 0]
import pylab
import random

SAMPLE_SIZE = 100

# seed random generator
# if no argument provided
# uses system current time
random.seed()

# pick some random values and store them here
# (the loop bound is SAMPLE_SIZE; no other size constant is defined)
real_rand_vars = [random.random() for val in range(SAMPLE_SIZE)]

# create histogram from data in 10 buckets
pylab.hist(real_rand_vars, 10)

# define x and y labels
pylab.xlabel("Number range")
pylab.ylabel("Count")

# show figure
pylab.show()
import pylab
import random

# days to generate data for
duration = 100
# mean value
mean_inc = 0.2
# standard deviation
std_dev_inc = 1.2

# time series
x = range(duration)
y = []

# Each day's value is the previous value plus a normally distributed step.
price_today = 0
for i in x:
    next_delta = random.normalvariate(mean_inc, std_dev_inc)
    price_today += next_delta
    y.append(price_today)

pylab.plot(x, y)
pylab.xlabel("Time")
pylab.ylabel("Value")
pylab.show()
# coding: utf-8
import random
import matplotlib
import matplotlib.pyplot as plt

SAMPLE_SIZE = 1000
# histogram buckets
buckets = 100

plt.figure()

# we need to update font size just for this example
matplotlib.rcParams.update({'font.size': 7})

plt.subplot(621)
plt.xlabel("random.random")
# Return the next random floating point number in the range [0.0, 1.0).
# range(SAMPLE_SIZE) draws exactly SAMPLE_SIZE values.
res = [random.random() for _ in range(SAMPLE_SIZE)]
plt.hist(res, buckets)
对于第二个图,我们绘制了一个均匀分布的随机变量。
plt.subplot(622)
plt.xlabel("random.uniform")
# Return a random floating point number N such that a <= N <= b for a <= b
# and b <= N <= a for b < a.
# The end-point value b may or may not be included in the range depending on
# floating-point rounding in the equation a + (b-a) * random().
a = 1
b = SAMPLE_SIZE
# range(SAMPLE_SIZE) draws exactly SAMPLE_SIZE values.
res = [random.uniform(a, b) for _ in range(SAMPLE_SIZE)]
plt.hist(res, buckets)
第三个图是三角形分布。
plt.subplot(623)
plt.xlabel("random.triangular")
# Return a random floating point number N such that low <= N <= high and
# with the specified mode between those bounds. The low and high bounds
# default to zero and one. The mode argument defaults to the midpoint
# between the bounds, giving a symmetric distribution.
low = 1
high = SAMPLE_SIZE
# range(SAMPLE_SIZE) draws exactly SAMPLE_SIZE values.
res = [random.triangular(low, high) for _ in range(SAMPLE_SIZE)]
plt.hist(res, buckets)
第四个图是贝塔分布。参数的条件是α和β应该大于零。返回值的范围在 0 和 1 之间。
plt.subplot(624)
plt.xlabel("random.betavariate")
alpha = 1
beta = 10
# range(SAMPLE_SIZE) draws exactly SAMPLE_SIZE values.
res = [random.betavariate(alpha, beta) for _ in range(SAMPLE_SIZE)]
plt.hist(res, buckets)
如果我们想有一些随机词,最简单的方法 (在 Linux 上) 可能是使用 /usr/share/dict/words。我们可以在下面的例子中看到这是如何做到的:
import random

# One word per line; strip the trailing newline from each.
with open('/usr/share/dict/words', 'rt') as f:
    words = [line.rstrip() for line in f]

# Print five distinct words picked at random.
for w in random.sample(words, 5):
    print(w)
这个解决方案只适用于 Unix,不会在 Windows 上运行 (不过会在 Mac OS 上运行)。对于 Windows,您可以使用从各种免费来源构建的文件 (古登堡计划、维基词典、英国国家语料库或彼得·诺维格博士的网站)。
import numpy
from numpy import*
from pylab import*
# possible window type
WINDOWS = ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']
# if you want to see just two window type, comment previous line,
# and uncomment the following one
# WINDOWS = ['flat', 'hanning']


def smooth(x, window_len=11, window='hanning'):
    """Smooth the data using a window with requested size.

    x          -- input signal (1-D numpy array)
    window_len -- length of smoothing window
    window     -- type of window: 'flat', 'hanning', 'hamming', 'bartlett',
                  'blackman'; a flat window produces a moving average

    Returns the smoothed signal, or x itself when window_len < 3.
    Raises ValueError for a non 1-D input, an input shorter than the
    window, or an unknown window name.
    """
    if x.ndim != 1:
        raise ValueError("smooth only accepts 1 dimension arrays.")
    if x.size < window_len:
        raise ValueError("Input vector needs to be bigger than window size.")
    if window_len < 3:
        return x
    if window not in WINDOWS:
        raise ValueError("Window is one of 'flat', 'hanning', 'hamming', "
                         "'bartlett', 'blackman'")

    # adding reflected windows in front and at the end
    s = numpy.r_[x[window_len - 1:0:-1], x, x[-1:-window_len:-1]]

    # pick windows type and do averaging
    if window == 'flat':
        # moving average
        w = numpy.ones(window_len, 'd')
    else:
        # look up the numpy window function by name; the name was checked
        # against WINDOWS above
        w = getattr(numpy, window)(window_len)

    # NOTE: length(output) != length(input), to correct this:
    # return y[(window_len/2-1):-(window_len/2)] instead of just y.
    y = numpy.convolve(w / w.sum(), s, mode='valid')
    return y
# Get some evenly spaced numbers over a specified interval.
t = linspace(-4, 4, 100)

# Make some noisy sinusoidal
x = sin(t)
xn = x + randn(len(t)) * 0.1

# windows
ws = 31

subplot(211)
plot(ones(ws))

# plot for every window on the same axes; repeated plot() calls add to
# the current axes, so no hold() call is needed (it no longer exists in
# current matplotlib)
for w in WINDOWS[1:]:
    # look the window function up by name instead of eval()-ing a string
    plot(getattr(numpy, w)(ws))

# configure axis properties
axis([0, 30, 0, 1.1])

# add legend for every window
legend(WINDOWS)
title("Smoothing windows")

# add second plot
subplot(212)

# draw original signal
plot(x)

# and signal with added noise
plot(xn)

# smooth signal with noise for every possible windowing algorithm
for w in WINDOWS:
    plot(smooth(xn, 10, w))

# add legend for every graph
l = ['original signal', 'signal with noise']
l.extend(WINDOWS)
legend(l)

title("Smoothed signal")

show()
import numpy as np
import pylab as p
import scipy.signal as signal

# linear ramp from 0 to 1
x = np.linspace(0, 1, 101)

# spike every tenth sample, starting at index 3, to act as noise
x[3::10] = 1.5

# original signal followed by its median-filtered versions
p.plot(x)
for kernel_size in (3, 5):
    p.plot(signal.medfilt(x, kernel_size))

p.legend(['original signal', 'length 3', 'length 5'])
p.show()
from matplotlib.pyplot import *

# some simple data
x = [1, 2, 3, 4]
y = [5, 4, 3, 2]
# second series, only used by the stacked bar chart
y1 = [7, 8, 5, 3]

figure()

# 2 x 3 grid of subplots; cell 1 is a line plot
subplot(2, 3, 1)
plot(x, y)

# cell 2: vertical bars
subplot(2, 3, 2)
bar(x, y)

# cell 3: horizontal bars
subplot(2, 3, 3)
barh(x, y)

# cell 4: stacked bars, the second series sits on top of the first
subplot(2, 3, 4)
bar(x, y)
bar(x, y1, bottom=y, color='r')

# cell 5: box plot
subplot(2, 3, 5)
boxplot(x)

# cell 6: scatter plot
subplot(2, 3, 6)
scatter(x, y)

show()
import matplotlib.pyplot as pl
import numpy as np

# 256 points from -pi to pi, end point included
x = np.linspace(-np.pi, np.pi, 256, endpoint=True)

y = np.cos(x)
y1 = np.sin(x)

# cosine first, then sine, on the same axes
for curve in (y, y1):
    pl.plot(x, curve)

pl.show()
按照这个简单的图,我们可以定制更多,以提供更多信息,并更精确地了解轴和边界:
from pylab import *
import numpy as np

# generate uniformly distributed
# 256 points from -pi to pi, inclusive
x = np.linspace(-np.pi, np.pi, 256, endpoint=True)

# these are vectorised versions
# of math.cos, and math.sin in built-in Python maths
# compute cos for every x
y = np.cos(x)

# compute sin for every x
y1 = np.sin(x)

# plot cos
plot(x, y)

# plot sin
plot(x, y1)

# define plot title (raw string so the backslashes reach the math-text
# renderer unchanged and are not read as string escapes)
title(r"Functions $\sin$ and $\cos$")

# set x limit
xlim(-3.0, 3.0)
# set y limit
ylim(-1.0, 1.0)

# format ticks at specific values
xticks([-np.pi, -np.pi / 2, 0, np.pi / 2, np.pi],
       [r'$-\pi$', r'$-\pi/2$', r'$0$', r'$+\pi/2$', r'$+\pi$'])
yticks([-1, 0, +1],
       [r'$-1$', r'$0$', r'$+1$'])

show()
如果我们想在当前图形中只添加一条线,可以使用 matplotlib.pyplot.axhline() 或 matplotlib.pyplot.axvline()。函数 axhline() 和 axvline() 将分别在给定的 y 和 x 数据值处绘制水平线和垂直线。它们共享相似的参数,其中最重要的是用于 axhline() 的 y 位置、xmin 和 xmax,以及用于 axvline() 的 x 位置、ymin 和 ymax。
让我们看一下它的外观,在同一个 IPython 会话中继续:
In [3]: axhline()
Out[3]: <matplotlib.lines.Line2D at 0x414ecd0>
In [4]: axvline()
Out[4]: <matplotlib.lines.Line2D at 0x4152490>
In [5]: axhline(4)
Out[5]: <matplotlib.lines.Line2D at 0x4152850>
from pylab import *

# current axes
ax = gca()

# tight view, maximum number of tick intervals set to 10
ax.locator_params(tight=True, nbins=10)

# 100 values drawn from a normal distribution
samples = np.random.normal(10, .1, 100)
ax.plot(samples)

show()
我们看到 x 轴和 y 轴是如何划分的,显示了哪些值。我们本可以使用定位器类实现相同的设置。这里我们说的是'将主定位器设置为 10 的倍数':
from pylab import *
import matplotlib as mpl
import datetime

fig = figure()

# get current axis
ax = gca()

# set some daterange
# (plain 1, not 01: zero-prefixed integer literals are a syntax error on
# Python 3)
start = datetime.datetime(2013, 1, 1)
stop = datetime.datetime(2013, 12, 31)
delta = datetime.timedelta(days=1)

# convert dates for matplotlib
dates = mpl.dates.drange(start, stop, delta)

# generate some random values
values = np.random.rand(len(dates))

# create plot with dates
ax.plot_date(dates, values, linestyle='-', marker='')

# specify formatter
date_format = mpl.dates.DateFormatter('%Y-%m-%d')

# apply formatter
ax.xaxis.set_major_formatter(date_format)

# autoformat date labels
# rotates labels by 30 degrees by default
# use rotate param to specify different rotation degree
# use bottom param to give more room to date labels
fig.autofmt_xdate()

show()
import numpy as np
import matplotlib.pyplot as plt

# parameters of the normal distribution the sample is drawn from
mu, sigma = 100, 15
x = np.random.normal(mu, sigma, 10000)

ax = plt.gca()

# red histogram of the sample, 35 bins
ax.hist(x, bins=35, color='r')

ax.set_xlabel('Values')
ax.set_ylabel('Frequency')

# the title shows both parameters using math text
title_text = r'$\mathrm{Histogram:}\ \mu=%d,\ \sigma=%d$' % (mu, sigma)
ax.set_title(title_text)

plt.show()
这为我们的数据样本创建了一个整洁的红色直方图。
它是如何工作的
我们从生成一些正态分布的数据开始。直方图以指定的箱数 (35 个) 绘制,并通过将 normed 设置为 True(或 1) 进行归一化;我们将 color 设置为 red (r)。
import numpy as np
import matplotlib.pyplot as plt

# generate number of measurements
# (numbered from 1: log(0) would give -inf for the first bar)
x = np.arange(1, 11, 1)

# values computed from "measured"
y = np.log(x)

# add some error samples from standard normal distribution
xe = 0.1 * np.abs(np.random.randn(len(y)))

# draw and show errorbar
plt.bar(x, y, yerr=xe, width=0.4, align='center', ecolor='r',
        color='cyan', label='experiment #1')

# give some explanations
plt.xlabel('# measurement')
plt.ylabel('Measured values')
plt.title('Measurements')
plt.legend(loc='upper left')

plt.show()
from pylab import *

# square figure with square axes
figure(1, figsize=(6, 6))
ax = axes([0.1, 0.1, 0.8, 0.8])

# slices are ordered and drawn counter-clockwise
labels = ('Spring', 'Summer', 'Autumn', 'Winter')

# fractions are either x/sum(x) or x if sum(x) <= 1
x = [15, 30, 45, 10]

# explode must be a sequence of len(x) or None
explode = (0.1, 0.1, 0.1, 0.1)

pie(x,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    startangle=67)

title('Rainy days by season')

show()
如果我们不指定 startangle,分数将从 x 轴 (角度 0) 逆时针开始排序。如果我们将 90 指定为 startangle 的值,那么饼图将从 y 轴开始。
这是生成的饼图。
绘制填充区域
在本食谱中,我们将向您展示如何填充曲线下或两条不同曲线之间的区域。
怎么做
下面是一个如何填充两个轮廓之间区域的示例:
from matplotlib.pyplot import figure, show, gca
import numpy as np

x = np.arange(0.0, 2, 0.01)

# two different signals are measured
y1 = np.sin(2 * np.pi * x)
y2 = 1.2 * np.sin(4 * np.pi * x)

fig = figure()
ax = gca()

# both signals as black lines
ax.plot(x, y1, x, y2, color='black')

# shade the area between them, one colour per logical condition
for condition, colour in ((y2 >= y1, 'darkblue'), (y2 <= y1, 'deeppink')):
    ax.fill_between(x, y1, y2, where=condition, facecolor=colour,
                    interpolate=True)

ax.set_title('filled between')

show()