#! /usr/bin/env python
from random import randint
import time
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
def swap(lst, i, j):
temp = lst[i]
lst[i] = lst[j]
lst[j] = temp
def swap_bitwise(lst, i, j):
lst[i] = lst[i] ^ lst[j]
lst[j] = lst[i] ^ lst[j]
lst[i] = lst[i] ^ lst[j]
def shuffle(lst, swap_func, should_print=False):
for i in xrange(len(lst)):
j = randint(0, len(lst)-1)
swap_func(lst, i, j)
if should_print:
print i, j, lst
# more or less due to Fisher, Yates, Durstenfeld, and Knuth
def shuffle2(lst, swap_func, should_print=False):
for i in range(len(lst))[::-1]:
j = randint(0, i)
swap_func(lst, i, j)
if should_print:
print i, j, lst
def demo(shuffle_func):
lst = range(10)
print 'list before:', lst
print 'shuffling ...'
shuffle_func(lst, swap, True)
print '...done'
print 'list after:', lst
def function_times(fs, maxlen):
res = []
for f in fs:
r = []
for i in xrange(maxlen):
data = range(i)
start = time.time()
shuffle(data, f)
end = time.time()
r.append((i, end-start))
res.append(r)
return res
def function_times2(fs, maxlen):
res = []
for f in fs:
r = []
for i in [1000*i for i in xrange(maxlen)]:
data = range(i)
start = time.time()
shuffle(data, f)
end = time.time()
r.append((i, end-start))
res.append(r)
return res
def plot_prob():
tests = 10000
n_elements = 20
m = np.array([[0.]*n_elements]*n_elements)
for k in xrange(tests):
initial = range(n_elements)
shuffle2(initial, swap)
for i in range(len(initial)):
m[i][initial[i]] += 1
m = m / tests
plt.imshow(m)
plt.colorbar()
plt.title('Color Map')
plt.show()
plt.clf()
#print m.reshape(-1,).tolist()
plt.title('Histogram')
plt.hist(m.reshape(-1,).tolist())
plt.show()
plt.clf()
def plot_times():
functions = [swap, swap_bitwise]
mlen = 1000
res = function_times(functions, mlen)
fig = plt.figure(dpi=90)
ax = fig.add_subplot(111)
for r in res:
ax.plot([rr[0] for rr in r], [rr[1] for rr in r])
ax.legend(['swap', 'bitwise swap'],
loc='upper center',
#bbox_to_anchor=(0.5, 1.05),
ncol=2, fancybox=True, shadow=True)
#ax.set_yscale('log')
plt.title('Shuffle function runtimes versus list length')
plt.xlabel('list length')
plt.ylabel('time (seconds)')
plt.show()
if __name__ == '__main__':
#demo(shuffle)
#plot_times()
plot_prob()
Below is the printout of the demo() function.
Below is a plot of the time to shuffle, the output of the plot_times() function. As you can see, the bitwise swap is a little slower (it takes more time) than the regular swap.
Now, we change the function function_times() to shuffle much larger lists. Since it takes quite a while to iterate through every size 1, 2, ..., n, we will change the list sizes to 1000, 2000, ..., 200000. This is accomplished by changing one line in the function_times function, from
for i in xrange(maxlen):to
for i in [1000*i for i in xrange(maxlen)]:This functionality is incorporated into the function_times2() function. Here is the plot:
Notice the fluctuations near x = 150,000. This is most likely due to the fact that the list is so large that it cannot fit into cache, i.e., elements of the list are being read from RAM. If we continue to increase the list size, there will be another spike where the list is being read from disk, which is even slower. If this doesn't make sense to you, look up 'paging' and the 'principle of locality' to see how data is moved between cache, RAM, and disk.
Finally, let's compare shuffle() and shuffle2(). There are subtle differences that you can see, but let's look at the output in two ways: a colormap and a histogram. First, the colormap for shuffle():
Since we are using 20 values, the probability that i goes to j should be 1/20 = 0.05. There is, however, a non-uniform distribution of values. The histogram below confirms this: if there was no bias, the values would assume a normal (gaussian) distribution around 0.05, but the data is skewed right.
The histogram for shuffle():
Here's the colormap for shuffle2():
This looks much more uniform. Indeed, as the histogram for shuffle2() function below shows, the data is almost normally distributed. Notice the nice round shape and lack of (or less) skew.
For this reason, I recommend using shuffle2().












