Python 数据可视化秘籍（一）

Python 数据可视化秘籍（一） | 极客日志

$ sudo apt-get build-dep python-matplotlib

$ su -c 'yum-builddep python-matplotlib'

$ python -c 'import numpy; print numpy.__version__'

$ sudo apt-get install python-numpy

# in your terminal, type: $ sudo apt-get install python-numpy python-matplotlib python-scipy

$ cd ~/Downloads/
$ wget https://github.com/downloads/matplotlib/matplotlib/matplotlib-1.2.0.tar.gz
$ tar xzf matplotlib-1.2.0.tar.gz
$ cd matplotlib-1.2.0
$ python setup.py build
$ sudo python setup.py install

source /usr/local/bin/virtualenvwrapper.sh

(virt1)user1:~$ pip install matplotlib

$ sudo pip install virtualenv
$ sudo pip install virtualenvwrapper
# Create folder to hold all our virtual environments and export the path to it.
$ export VIRTENV=~/.virtualenvs
$ mkdir -p $VIRTENV
# We source (ie. execute) shell script to activate the wrappers
$ source /usr/local/bin/virtualenvwrapper.sh
# And create our first virtual environment
$ mkvirtualenv virt1

import numpy
print numpy.__version__
import scipy
print scipy.__version__
quit()

pip install numpy
brew install gfortran
pip install scipy

pip install virtualenv
pip install virtualenvwrapper

$ easy_install pip

export PATH=/usr/local/share/python:/usr/local/bin:$PATH

brew install python --framework --universal

export PATH=/usr/local/bin:$PATH

ruby <(curl -fsSkL raw.github.com/mxcl/homebrew/go)

$ sudo apt-get build-dep python-imaging
$ sudo pip install http://effbot.org/downloads/Imaging-1.1.7.tar.gz

# yum install python-imaging
# yum install freetype-devel
# pip install PIL

$ pip install requests

import requests
r = requests.get('http://github.com/timeline.json')
print r.content

# Import matplotlib under the alias that the following lines actually use.
import matplotlib as mpl

# Change the defaults applied to lines: width 2 and colour red.
mpl.rcParams['lines.linewidth'] = 2
mpl.rcParams['lines.color'] = 'r'

import matplotlib as mpl
mpl.rc('lines', linewidth=2, color='r')

# Plot a sine and a cosine wave, changing matplotlib's run-time
# configuration between the two plot calls.
import matplotlib.pyplot as plt
import numpy as np

t = np.arange(0.0, 1.0, 0.01)

s = np.sin(2 * np.pi * t)
# make line red
plt.rcParams['lines.color'] = 'r'
plt.plot(t, s)

c = np.cos(2 * np.pi * t)
# make line thick
plt.rcParams['lines.linewidth'] = 3
plt.plot(t, c)

plt.show()

$ python -c 'import matplotlib as mpl; print mpl.get_configdir()'

# Read a comma-separated file, treating the first row as the header,
# then print the header followed by every data row.
import csv
import sys

filename = 'ch02-data.csv'
header = None
data = []
try:
    with open(filename) as f:
        reader = csv.reader(f)
        # The first row holds the column names; None when the file is empty.
        header = next(reader, None)
        data = [row for row in reader]
except csv.Error as e:
    print("Error reading CSV file at line %s: %s" % (reader.line_num, e))
    sys.exit(-1)

if header:
    print(header)
    print('==================')
for datarow in data:
    print(datarow)

import xlrd
file='ch02-xlsxdata.xlsx'
wb = xlrd.open_workbook(filename=file)
ws = wb.sheet_by_name('Sheet1')
dataset = []
for r in xrange(ws.nrows):
    col = []
    for c in range(ws.ncols):
        col.append(ws.cell(r, c).value)
    dataset.append(col)
from pprint import pprint
pprint(dataset)

… 207152670398435680411695324270531801466959270421533831670088597261315325444920138359697328651524421074004769531360921567802830421421342037064593625911780546 …

import struct
import string
datafile = 'ch02-fixed-width-1M.data'
# this is where we define how to
# understand line of data from the file
mask='9s14s5s'
with open(datafile,'r') as f:
    for line in f:
        fields = struct.Struct(mask).unpack_from(line)
        print 'fields: ',[field.strip()for field in fields]

# Read a tab-delimited file, treating the first row as the header,
# then print the header followed by every data row.
import csv
import sys

filename = 'ch02-data.tab'
header = None
data = []
try:
    with open(filename) as f:
        # The excel_tab dialect splits fields on tab characters.
        reader = csv.reader(f, dialect=csv.excel_tab)
        # The first row holds the column names; None when the file is empty.
        header = next(reader, None)
        data = [row for row in reader]
except csv.Error as e:
    print("Error reading CSV file at line %s: %s" % (reader.line_num, e))
    sys.exit(-1)

if header:
    print(header)
    print('===================')
for datarow in data:
    print(datarow)

import requests
url = 'https://github.com/timeline.json'
r = requests.get(url)
json_obj = r.json()
repos = set()
for entry in json_obj:
    try:
        repos.add(entry['repository']['url'])
    except KeyError as e:
        print "No key %s. Skipping..."%(e)
from pprint import pprint
pprint(repos)

# Command-line entry point: read a fixed-width data file and print it to
# stdout converted to the requested export format.
if __name__ =='__main__':
    # parse the two positional command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("import_file",help="Path to a fixed-width data file.")
    parser.add_argument("export_format",help="Export format: json, csv, xlsx.")
    args = parser.parse_args()
    if args.import_file is None:
        print >> sys.stderr,"You must specify path to import from."
        sys.exit(1)
    # reject any export format other than the three supported ones
    if args.export_format not in ('csv','json','xlsx'):
        print >> sys.stderr,"You must provide valid export file format."
        sys.exit(1)
    # verify the given path is an accessible file
    if not os.path.isfile(args.import_file):
        print >> sys.stderr,"Given path is not a file: %s"% args.import_file
        sys.exit(1)
    # read from the formatted fixed-width file
    # NOTE(review): import_data and write_data are not defined in this
    # listing; presumably they are defined elsewhere in the same script.
    data = import_data(args.import_file)
    # export data to the specified format;
    # to make this Unix-like and pipe-able
    # we just print to stdout
    print write_data(data, args.export_format)

def write_csv(data):
    '''Render the rows in data as CSV text and return that text.'''
    # csv.writer needs a file-like target, so the output is collected
    # in an in-memory buffer instead of a real file.
    buffer = StringIO.StringIO()
    csv.writer(buffer).writerows(data)
    return buffer.getvalue()

def write_json(data):
    '''Serialize data to a JSON string and return it.'''
    return json.dumps(data)

def write_xlsx(data):
    '''Write data into an Excel workbook and return the workbook content.

    data -- iterable of rows; each row is an iterable of cell values.

    Returns the content that xlwt saved into an in-memory buffer.
    Only rows with index 0..65535 are stored; if more rows are supplied a
    warning is written to stderr and the remaining rows are dropped.
    '''
    from xlwt import Workbook
    # Highest row index we are able to save in one spreadsheet.
    max_row_index = 65535
    book = Workbook()
    sheet1 = book.add_sheet("Sheet 1")
    for row, line in enumerate(data):
        if row > max_row_index:
            sys.stderr.write("Hit limit of # of rows in one sheet (65535).\n")
            break
        for col, datum in enumerate(line):
            sheet1.write(row, col, datum)
    # The workbook can only be saved to a file-like object, so save it
    # into an in-memory buffer and return what was written there.
    f = StringIO.StringIO()
    book.save(f)
    return f.getvalue()

SELECT ID, Name, Population FROM City ORDER BY Population DESC LIMIT 1000

# Execute a SQL script against a SQLite database.
# argv[1] is the path of the SQL script; the optional argv[2] is the
# database file (an in-memory database is used when it is omitted).
import sqlite3
import sys

if len(sys.argv) < 2:
    print("Error: You must supply at least SQL script.")
    # The script path comes first and the database second, matching the
    # order in which the arguments are read below.
    print("Usage: %s ./sql-dump.sql [table.db]" % (sys.argv[0]))
    sys.exit(1)

script_path = sys.argv[1]
if len(sys.argv) == 3:
    db = sys.argv[2]
else:
    # if DB is not defined
    # create memory database
    db = ":memory:"

try:
    con = sqlite3.connect(db)
    # Using the connection as a context manager commits on success and
    # rolls back if an exception is raised.
    with con:
        cur = con.cursor()
        with open(script_path, 'rb') as f:
            cur.executescript(f.read())
except sqlite3.Error as err:
    print("Error occured: %s" % err)
    # Report the failure to the calling shell as well.
    sys.exit(1)

import sqlite3
import sys
if len(sys.argv)!=2:
    print "Please specify database file."
    sys.exit(1)
db = sys.argv[1]
try:
    con = sqlite3.connect(db)
    with con:
        cur = con.cursor()
        query = 'SELECT ID, Name, Population FROM City ORDER BY Population DESC LIMIT 1000'
        con.text_factory = str
        cur.execute(query)
        resultset = cur.fetchall()
        # extract column names
        col_names = [cn[0]for cn in cur.description]
        print "%10s %30s %10s"%tuple(col_names)
        print "="*(10+1+30+1+10)
        for row in resultset:
            print "%10s %30s %10s"% row
except sqlite3.Error as err:
    print "[ERROR]:", err

import numpy as np
import matplotlib.pyplot as plt

def is_outlier(points, threshold=3.5):
    """Return a boolean array that is True where a point is an outlier.

    points -- 1-D sequence of observations, or a 2-D array with one
        observation per row.
    threshold -- observations with a modified z-score greater than this
        value are classified as outliers.

    The modified z-score is described at
    http://www.itl.nist.gov/div898/handbook/eda/section4/eda43.htm#Iglewicz
    """
    points = np.asarray(points, dtype=float)
    # transform a flat vector into a column of observations
    if points.ndim == 1:
        points = points[:, None]
    # compute median value
    median = np.median(points, axis=0)
    # distance of every observation from the median
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    # compute MAD (median absolute deviation)
    med_abs_deviation = np.median(diff)
    if med_abs_deviation == 0:
        # Dividing by a zero MAD would give inf/nan scores; flag every
        # point that differs from the median, which is what those scores
        # yield for a finite positive threshold.
        return diff > 0
    # compute modified z-score
    modified_z_score = 0.6745 * diff / med_abs_deviation
    # return a mask for each outlier
    return modified_z_score > threshold

# Random data
x = np.random.random(100)
# histogram buckets
buckets = 50
# Add in a few outliers
x = np.r_[x,-49,95,100,-100]
# Keep valid data points
# Note here that
# "~" is logical NOT on boolean numpy arrays
filtered = x[~is_outlier(x)]
# plot histograms
plt.figure()
plt.subplot(211)
plt.hist(x, buckets)
plt.xlabel('Raw')
plt.subplot(212)
plt.hist(filtered, buckets)
plt.xlabel('Cleaned')
plt.show()

from pylab import*
# fake up some data
spread= rand(50)*100
center = ones(25)*50
# generate some outliers high and low
flier_high = rand(10)*100+100
flier_low = rand(10)*-100
# merge generated data set
data = concatenate((spread, center, flier_high, flier_low),0)
subplot(311)
# basic plot
# 'gx' defining the outlier plotting properties
boxplot(data,0,'gx')
# compare this with similar scatter plot
subplot(312)
spread_1 = concatenate((spread, flier_high, flier_low),0)
center_1 = ones(70)*25
scatter(center_1, spread_1)
xlim([0,50])
# and with another that is more appropriate for
# scatter plot
subplot(313)
center_2 = rand(70)*50
scatter(center_2, spread_1)
xlim([0,50])
show()

# generate uniform data points
x = 1e6*rand(1000)
y = rand(1000)
figure()
# crate first subplot
subplot(211)
# make scatter plot
scatter(x, y)
# limit x axis
xlim(1e-6,1e6)
# crate second subplot
subplot(212)
# make scatter plot
scatter(x,y)
# but make x axis logarithmic
xscale('log')
# set same x axis limit
xlim(1e-6,1e6)
show()

# Iterate over a large file one line at a time so that the whole file
# never has to be held in memory.
with open('/tmp/my_big_file', 'r') as bigfile:
    for line in bigfile:
        # line based operation, like 'print line'
        pass

# Read a huge file in blocks of `chunksize` lines, reporting the file
# position before and after every block.
import sys

filename = sys.argv[1]  # must pass valid file name

with open(filename, 'rb') as hugefile:
    chunksize = 1000
    readable = b''
    at_end = False
    # if you want to stop after certain number of blocks
    # put an extra condition in the while
    while not at_end:
        # if you want to start not from 1st byte
        # do a hugefile.seek(skipbytes) to skip
        # skipbytes of bytes from the file start
        start = hugefile.tell()
        print("starting at: %s" % start)
        # collect up to chunksize lines
        lines = []
        for _ in range(chunksize):
            line = next(hugefile, None)
            if line is None:
                # ran out of lines: finish this block and stop looping
                at_end = True
                break
            lines.append(line)
        if not lines:
            break
        file_block = b''.join(lines)
        print('file_block %s %s' % (type(file_block), file_block))
        # add the finished block exactly once
        readable = readable + file_block
        # tell where are we in file
        # file IO is usually buffered so tell()
        # will not be precise for every read.
        stop = hugefile.tell()
        print('readable %s %s' % (type(readable), readable))
        print('reading bytes from %s to %s' % (start, stop))
        print('read bytes total: %s' % len(readable))
        # if you want to pause read between chunks
        # uncomment following line
        #raw_input()

$ python ch02-chunk-read.py myhugefile.dat

# Follow a file as it grows: start at its current end and print every
# line that is appended afterwards.
import time
import os
import sys

if len(sys.argv) != 2:
    sys.stderr.write("Please specify filename to read\n")
    sys.exit(1)

filename = sys.argv[1]

if not os.path.isfile(filename):
    sys.stderr.write("Given file: \"%s\" is not a file\n" % filename)
    sys.exit(1)

with open(filename, 'r') as f:
    # Move to the end of file
    f.seek(0, os.SEEK_END)
    # endlessly loop
    while True:
        where = f.tell()
        # try reading a line
        line = f.readline()
        if not line:
            # nothing new yet: wait, then retry from the same position
            time.sleep(1)
            f.seek(where)
        else:
            # readline() keeps the line's own newline, so write it as is
            sys.stdout.write(line)

import scipy.misc
import matplotlib.pyplot as plt
# load already prepared ndarray from scipy
lena = scipy.misc.lena()
# set the default colormap to gray
plt.gray()
plt.imshow(lena)
plt.colorbar()
plt.show()

print lena.shape
print lena.max()
print lena.dtype

import numpy
import Image
import matplotlib.pyplot as plt
bug = Image.open('stinkbug.png')
arr = numpy.array(bug.getdata(), numpy.uint8).reshape(bug.size[1], bug.size[0],3)
plt.gray()
plt.imshow(arr)
plt.colorbar()
plt.show()

import matplotlib.pyplot as plt
# Import the subpackage explicitly: a bare "import scipy" is not
# guaranteed to make scipy.misc available as an attribute.
import scipy.misc
import numpy

bug = scipy.misc.imread('stinkbug1.png')
# if you want to inspect the shape of the loaded image
# uncomment following line
#print bug.shape
# the original image is RGB having values for all three
# channels separately. We need to convert that to greyscale image
# by picking up just one channel.
# convert to gray
bug = bug[:, :, 0]

>>> a = array([5,1,2,3,4])
>>> a[2:3]
array([2])
>>> a[:2]
array([5,1])
>>> a[3:]
array([3,4])

>>> b = array([[1,1,1],[2,2,2],[3,3,3]])
# matrix 3 x 3
>>> b[0,:]
# pick first row
array([1,1,1])
>>> b[:,0]
# we pick the first column
array([1,2,3])

# show original image
plt.figure()
plt.gray()
plt.subplot(121)
plt.imshow(bug)
# show 'zoomed' region
zbug = bug[100:350,140:350]

plt.subplot(122)
plt.imshow(zbug)
plt.show()

# Draw a histogram of SAMPLE_SIZE values produced by random.random().
import pylab
import random

SAMPLE_SIZE = 100

# seed random generator
# if no argument provided
# uses system current time
random.seed()

# pick some random values and store them here
real_rand_vars = [random.random() for val in range(SAMPLE_SIZE)]

# create histogram from data in 10 buckets
pylab.hist(real_rand_vars, 10)

# define x and y labels
pylab.xlabel("Number range")
pylab.ylabel("Count")

# show figure
pylab.show()

# Plot a simulated series whose daily increments are drawn from a
# normal distribution.
import pylab
import random

# days to generate data for
duration = 100
# mean value
mean_inc = 0.2
# standard deviation
std_dev_inc = 1.2

# time series
x = range(duration)
y = []
price_today = 0
for i in x:
    # today's value is yesterday's value plus a random increment
    next_delta = random.normalvariate(mean_inc, std_dev_inc)
    price_today += next_delta
    y.append(price_today)

pylab.plot(x, y)
pylab.xlabel("Time")
pylab.ylabel("Value")
pylab.show()

# coding: utf-8
# Show histograms of samples drawn from several generators of the
# standard random module, one subplot per generator.
import random
import matplotlib
import matplotlib.pyplot as plt

SAMPLE_SIZE = 1000
# histogram buckets
buckets = 100

plt.figure()
# we need to update font size just for this example
matplotlib.rcParams.update({'font.size': 7})


def _plot_distribution(position, label, sampler):
    """Histogram SAMPLE_SIZE - 1 draws of sampler() in one subplot.

    position -- 1-based cell of the 6 x 2 subplot grid to draw into.
    label -- text used as the x label of the subplot.
    sampler -- zero-argument callable returning one random value.
    """
    plt.subplot(6, 2, position)
    plt.xlabel(label)
    res = [sampler() for _ in range(1, SAMPLE_SIZE)]
    plt.hist(res, buckets)


# Return the next random floating point number in the range [0.0, 1.0).
_plot_distribution(1, "random.random", random.random)

# Return a random floating point number N such that a <= N <= b for
# a <= b and b <= N <= a for b < a. The end-point value b may or may not
# be included in the range depending on floating-point rounding in the
# equation a + (b-a) * random().
a = 1
b = SAMPLE_SIZE
_plot_distribution(2, "random.uniform", lambda: random.uniform(a, b))

# Return a random floating point number N such that low <= N <= high and
# with the specified mode between those bounds. The low and high bounds
# default to zero and one. The mode argument defaults to the midpoint
# between the bounds, giving a symmetric distribution.
low = 1
high = SAMPLE_SIZE
_plot_distribution(3, "random.triangular",
                   lambda: random.triangular(low, high))

_plot_distribution(4, "random.betavariate",
                   lambda: random.betavariate(1, 10))

lambd = 1.0 / ((SAMPLE_SIZE + 1) / 2.)
_plot_distribution(5, "random.expovariate",
                   lambda: random.expovariate(lambd))

_plot_distribution(6, "random.gammavariate",
                   lambda: random.gammavariate(1, 10))

_plot_distribution(7, "random.lognormvariate",
                   lambda: random.lognormvariate(1, 0.5))

_plot_distribution(8, "random.normalvariate",
                   lambda: random.normalvariate(1, 0.5))

_plot_distribution(9, "random.paretovariate",
                   lambda: random.paretovariate(1))

plt.tight_layout()
plt.show()

import random
with open('/usr/share/dict/words','rt') as f:
    words = f.readlines()
    words = [w.rstrip()for w in words]
    for w in random.sample(words,5):
        print w

from pylab import*
from numpy import*

def moving_average(interval, window_size):
    '''Convolve interval with a uniform window of window_size samples.

    The result has the same length as interval.
    '''
    size = int(window_size)
    # every sample in the window carries the same weight, 1 / window_size
    kernel = ones(size) / float(window_size)
    return convolve(interval, kernel, 'same')

t = linspace(-4,4,100)
y = sin(t)+ randn(len(t))*0.1
plot(t, y,"k.")
# compute moving average
y_av = moving_average(y,10)
plot(t, y_av,"r")
#xlim(0,1000)
xlabel("Time")
ylabel("Value")
grid(True)
show()

import numpy
from numpy import*
from pylab import*
# possible window type
WINDOWS = ['flat','hanning','hamming','bartlett','blackman']
# if you want to see just two window type, comment previous line,
# and uncomment the following one
# WINDOWS = ['flat', 'hanning']

def smooth(x, window_len=11, window='hanning'):
    """Smooth the data using a window with requested size.

    x -- input signal, a 1-dimensional numpy array
    window_len -- length of the smoothing window
    window -- type of window: 'flat', 'hanning', 'hamming', 'bartlett',
        'blackman'; a flat window produces a moving average smoothing.

    Returns the smoothed signal. For window_len >= 3 the result holds
    len(x) + window_len - 1 samples; for smaller windows x is returned
    unchanged.

    Raises ValueError if x is not 1-dimensional, if x is shorter than
    window_len, or if window is not one of WINDOWS.
    """
    if x.ndim != 1:
        raise ValueError("smooth only accepts 1 dimension arrays.")
    if x.size < window_len:
        raise ValueError("Input vector needs to be bigger than window size.")
    if window_len < 3:
        return x
    if window not in WINDOWS:
        raise ValueError("Window is one of 'flat', 'hanning', 'hamming', "
                         "'bartlett', 'blackman'")
    # adding reflected windows in front and at the end
    s = numpy.r_[x[window_len-1:0:-1], x, x[-1:-window_len:-1]]
    # pick windows type and do averaging
    if window == 'flat':
        # moving average
        w = numpy.ones(window_len, 'd')
    else:
        # look up the numpy function of the same name; window was
        # checked against WINDOWS above
        w = getattr(numpy, window)(window_len)
    # NOTE: length(output) != length(input), to correct this:
    # return y[(window_len/2-1):-(window_len/2)] instead of just y.
    y = numpy.convolve(w / w.sum(), s, mode='valid')
    return y

# Get some evenly spaced numbers over a specified interval.
t = linspace(-4,4,100)
# Make some noisy sinusoidal
x = sin(t)
xn = x + randn(len(t))*0.1
# Smooth it
y = smooth(x)
# windows
ws = 31
subplot(211)
plot(ones(ws))
# draw on the same axes
hold(True)
# plot for every windows
for w in WINDOWS[1:]:
    eval('plot('+w+'(ws) )')
# configure axis properties
axis([0,30,0,1.1])
# add legend for every window
legend(WINDOWS)
title("Smoothing windows")
# add second plot
subplot(212)
# draw original signal
plot(x)
# and signal with added noise
plot(xn)
# smooth signal with noise for every possible windowing algorithm
for w in WINDOWS:
    plot(smooth(xn,10, w))
# add legend for every graph
l=['original signal','signal with noise']
l.extend(WINDOWS)
legend(l)
title("Smoothed signal")
show()

import numpy as np
import pylab as p
import scipy.signal as signal
# get some linear data
x = np.linspace (0,1,101)
# add some noisy signal
x[3::10]=1.5
p.plot(x)
p.plot(signal.medfilt(x,3))
p.plot(signal.medfilt(x,5))
p.legend(['original signal','length 3','length 5'])
p.show ()

In [1]: plot([1,2,3,2,3,2,2,1])
Out[1]: [<matplotlib.lines.Line2D at 0x412fb50>]

$ ipython --pylab

In [2]: plot([4,3,2,1],[1,2,3,4])
Out[2]: [<matplotlib.lines.Line2D at 0x31444d0>]

from matplotlib.pyplot import*
# some simple data
x = [1,2,3,4]
y = [5,4,3,2]
# create new figure
figure()
# divide subplots into 2 x 3 grid
# and select #1
subplot(231)
plot(x, y)
# select #2
subplot(232)
bar(x, y)
# horizontal bar-charts
subplot(233)
barh(x, y)
# create stacked bar charts
subplot(234)
bar(x, y)
# we need more data for stacked bar charts
y1 = [7,8,5,3]
bar(x, y1, bottom=y, color ='r')
# box plot
subplot(235)
boxplot(x)
# scatter plot
subplot(236)
scatter(x,y)
show()

import matplotlib.pyplot as pl
import numpy as np
x = np.linspace(-np.pi, np.pi,256, endpoint=True)
y = np.cos(x)
y1 = np.sin(x)
pl.plot(x,y)
pl.plot(x, y1)
pl.show()

from pylab import*
import numpy as np
# generate uniformly distributed
# 256 points from -pi to pi, inclusive
x = np.linspace(-np.pi, np.pi,256, endpoint=True)
# these are vectorised versions
# of math.cos, and math.sin in built-in Python maths
# compute cos for every x
y = np.cos(x)
# compute sin for every x
y1 = np.sin(x)
# plot cos
plot(x, y)
# plot sin
plot(x, y1)
# define plot title
title("Functions $\sin$ and $\cos$")
# set x limit
xlim(-3.0,3.0)
# set y limit
ylim(-1.0,1.0)
# format ticks at specific values
xticks([-np.pi,-np.pi/2,0, np.pi/2, np.pi],[r'$-\pi$',r'$-\pi/2$',r'$0$',r'$+\pi/2$',r'$+\pi$'])
yticks([-1,0,+1],[r'$-1$',r'$0$',r'$+1$'])
show()

In [1]: axis()
Out[1]: (0.0,1.0,0.0,1.0)

In [2]: l =[-1,1,-10,10]
In [3]: axis(l)
Out[3]: [-1,1,-10,10]

In [3]: axhline()
Out[3]: <matplotlib.lines.Line2D at 0x414ecd0>
In [4]: axvline()
Out[4]: <matplotlib.lines.Line2D at 0x4152490>
In [5]: axhline(4)
Out[5]: <matplotlib.lines.Line2D at 0x4152850>

plot(x, y, linewidth=1.5)

line,= plot(x, y)
line.set_linewidth(1.5)

lines = plot(x, y)
setp(lines,'linewidth',1.5)

setp(lines, linewidth=1.5)

color ='#eeefff'

color =(0.3,0.3,0.4)

title('Title in a custom color', color='#123456')

subplot(111, axisbg=(0.1843,0.3098,0.3098))

from pylab import*
# get current axis
ax = gca()
# set view to tight, and maximum number of tick intervals to 10
ax.locator_params(tight=True, nbins =10)
# generate 100 normal distribution values
ax.plot(np.random.normal(10,.1,100))
show()

ax.xaxis.set_major_locator(matplotlib.ticker.MultipleLocator(10))

# Plot one random value per day of 2013 against a date-formatted x axis.
from pylab import*
import matplotlib as mpl
import datetime

fig = figure()
# get current axis
ax = gca()

# set some daterange
# (plain 1 rather than 01: a zero-prefixed integer literal is octal in
# Python 2 and a syntax error in Python 3)
start = datetime.datetime(2013, 1, 1)
stop = datetime.datetime(2013, 12, 31)
delta = datetime.timedelta(days=1)

# convert dates for matplotlib
dates = mpl.dates.drange(start, stop, delta)
# generate some random values
values = np.random.rand(len(dates))

# create plot with dates
ax.plot_date(dates, values, linestyle='-', marker='')

# specify formatter
date_format = mpl.dates.DateFormatter('%Y-%m-%d')
# apply formatter
ax.xaxis.set_major_formatter(date_format)

# autoformat date labels
# rotates labels by 30 degrees by default
# use rotation param to specify different rotation degree
# use bottom param to give more room to date labels
fig.autofmt_xdate()
show()

from matplotlib.pyplot import*
# generate different normal distributions
x1 = np.random.normal(30,3,100)
x2 = np.random.normal(20,2,100)
x3 = np.random.normal(10,3,100)
# plot them
plot(x1, label='plot')
plot(x2, label='2nd plot')
plot(x3, label='last plot')
# generate a legend box
legend(bbox_to_anchor=(0.,1.02,1.,.102), loc=3, ncol=3, mode="expand", borderaxespad=0.)
# annotate an important value
annotate("Important value",(55,20), xycoords='data', xytext=(5,38), arrowprops=dict(arrowstyle='->'))
show()

import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(-np.pi, np.pi,500, endpoint=True)
y = np.sin(x)
plt.plot(x, y)
ax = plt.gca()
# hide two spines
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
# move bottom and left spine to 0,0
ax.spines['bottom'].set_position(('data',0))
ax.spines['left'].set_position(('data',0))
# move ticks positions
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
plt.show()

import numpy as np
import matplotlib.pyplot as plt
mu = 100
sigma = 15
x = np.random.normal(mu, sigma,10000)
ax = plt.gca()
# the histogram of the data
ax.hist(x, bins=35, color='r')
ax.set_xlabel('Values')
ax.set_ylabel('Frequency')
ax.set_title(r'$\mathrm{Histogram:}\ \mu=%d,\ \sigma=%d$'%(mu, sigma))
plt.show()

import numpy as np
import matplotlib.pyplot as plt
# generate number of measurements
x = np.arange(0,10,1)
# values computed from "measured"
y = np.log(x)
# add some error samples from standard normal distribution
xe = 0.1* np.abs(np.random.randn(len(y)))
# draw and show errorbar
plt.bar(x, y, yerr=xe, width=0.4, align='center', ecolor='r', color='cyan', label='experiment #1');
# give some explainations
plt.xlabel('# measurement')
plt.ylabel('Measured values')
plt.title('Measurements')
plt.legend(loc='upper left')
plt.show()

from pylab import*
# make a square figure and axes
figure(1, figsize=(6,6))
ax = axes([0.1,0.1,0.8,0.8])
# the slices will be ordered
# and plotted counter-clockwise.
labels ='Spring','Summer','Autumn','Winter'
# fractions are either x/sum(x) or x if sum(x) <= 1
x =[15,30,45,10]
# explode must be len(x) sequence or None
explode=(0.1,0.1,0.1,0.1)
pie(x, explode=explode, labels=labels, autopct='%1.1f%%', startangle=67)
title('Rainy days by season')
show()

from matplotlib.pyplot import figure, show, gca
import numpy as np
x = np.arange(0.0,2,0.01)
# two different signals are measured
y1 = np.sin(2*np.pi*x)
y2 = 1.2*np.sin(4*np.pi*x)
fig = figure()
ax = gca()
# plot and
# fill between y1 and y2 where a logical condition is met
ax.plot(x, y1, x, y2, color='black')
ax.fill_between(x, y1, y2, where=y2>=y1, facecolor='darkblue', interpolate=True)
ax.fill_between(x, y1, y2, where=y2<=y1, facecolor='deeppink', interpolate=True)
ax.set_title('filled between')
show()

import matplotlib.pyplot as plt
import numpy as np
# generate x values
x = np.random.randn(1000)
# random measurements, no correlation
y1 = np.random.randn(len(x))
# strong correlation
y2 = 1.2+ np.exp(x)
ax1 = plt.subplot(121)
plt.scatter(x, y1, color='indigo', alpha=0.3, edgecolors='white', label='no correl')
plt.xlabel('no correlation')
plt.grid(True)
plt.legend()
ax2 = plt.subplot(122, sharey=ax1, sharex=ax1)
plt.scatter(x, y2, color='green', alpha=0.3, edgecolors='grey', label='correl')
plt.xlabel('strong correlation')
plt.grid(True)
plt.legend()
plt.show()

Python 数据可视化秘籍（一）

Python 数据可视化基础指南

前言

第一章：准备你的工作环境

安装 matplotlib、NumPy 和 SciPy

做好准备

怎么做

安装 virtualenv 和 virtualenvwrapper

怎么做

在 Mac OS X 上安装 matplotlib

怎么做

在 Windows 上安装 matplotlib

怎么做

安装 Python 图像库 (PIL) 进行图像处理

怎么做

安装 requests 模块

怎么做

在代码中自定义 matplotlib 的参数

怎么做

定制每个项目的 matplotlib 参数

怎么做

它是如何工作的

第二章：了解您的数据

从 CSV 导入数据

怎么做

从微软 Excel 文件导入数据

怎么做

从固定宽度的数据文件导入数据

怎么做

从制表符分隔的文件导入数据

怎么做

从 JSON 资源导入数据

怎么做

将数据导出到 JSON、CSV、Excel

怎么做

从数据库导入数据

怎么做

清除数据中的异常值

怎么做

分块读取文件

怎么做

读取流数据源

怎么做

将图像数据导入 NumPy 数组

怎么做

它是如何工作的

生成受控随机数据集

怎么做

平滑真实数据中的噪声

怎么做

它是如何工作的

第三章：绘制您的第一个绘图并自定义它们

定义绘图类型–条形图、折线图和堆叠图

怎么做

绘制简单的正弦和余弦图

怎么做

定义轴长度和极限

怎么做

它是如何工作的

定义绘图线样式、特性和格式字符串

怎么做

它是如何工作的

颜色

背景颜色

设置刻度、标签和网格

怎么做

注

添加图例和标注

怎么做

它是如何工作的

将轴线（spines）移到中心

怎么做

它是如何工作的

还有更多

制作直方图

怎么做

它是如何工作的

用误差线制作条形图

怎么做

它是如何工作的