```python
# -*- coding: utf-8 -*-
import pickle as p
import numpy as np
import os


def load_CIFAR_batch(filename):
    """Load a single batch of the CIFAR dataset."""
    with open(filename, 'r') as f:
        datadict = p.load(f)
        X = datadict['data']
        Y = datadict['labels']
        X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
        Y = np.array(Y)
        return X, Y


def load_CIFAR10(ROOT):
    """Load all of the CIFAR-10 data."""
    xs = []
    ys = []
    for b in range(1, 6):
        f = os.path.join(ROOT, 'data_batch_%d' % (b,))
        X, Y = load_CIFAR_batch(f)
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    del X, Y
    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
    return Xtr, Ytr, Xte, Yte
```

Running this produced the following error:


```
'gbk' codec can't decode byte 0x80 in position 0: illegal multibyte sequence
```

So I started searching around and asking more experienced people, and the answers online were all much the same.

But none of them solved the problem! (I spent an entire afternoon searching and trying every one of those answers.)

Just when I was getting desperate, I finally found a different answer and, figuring I had nothing to lose, gave it a try:


```python
def load_CIFAR_batch(filename):
    """Load a single batch of the CIFAR dataset."""
    with open(filename, 'rb') as f:
        datadict = p.load(f, encoding='latin1')
        X = datadict['data']
        Y = datadict['labels']
        X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
        Y = np.array(Y)
        return X, Y
```

It worked! No more errors! Two things changed: the file is now opened in binary mode (`'rb'` instead of `'r'`), and `pickle.load` is given `encoding='latin1'`. Pickle files are binary data, so they must be read in binary mode; and because the CIFAR-10 batches were pickled under Python 2, unpickling them under Python 3 needs an explicit encoding (the default, `'ASCII'`, chokes on the raw image bytes). But what does `encoding='latin1'` actually mean? Some searching turned up the following:


latin1 is an alias for ISO-8859-1 (sometimes written Latin-1). ISO-8859-1 is a single-byte encoding that is backward compatible with ASCII: its range is 0x00-0xFF, where 0x00-0x7F matches ASCII exactly, 0x80-0x9F are control characters, and 0xA0-0xFF are printable characters.

Because ISO-8859-1 assigns a meaning to every possible byte value, a byte stream in any other encoding can be stored and transmitted on a system that supports ISO-8859-1 without any bytes being dropped. In other words, it is safe to treat a byte stream of any encoding as if it were ISO-8859-1. This is an important property, and it is the reason MySQL historically defaulted to latin1. ASCII is a 7-bit container, while ISO-8859-1 is a full 8-bit container.
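A quick way to convince yourself of that round-trip property (a minimal sketch of my own, not from the original post):

```python
# Every possible byte value decodes under latin1 and encodes back unchanged.
raw = bytes(range(256))
assert raw.decode('latin1').encode('latin1') == raw
print("latin1 round-trips all 256 byte values")
```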

But before I could even celebrate, the next run hit another problem:


```
MemoryError
```

What the hell? A memory error! Sure enough, it comes down to the sheer size of the data. Recall this line:


```python
X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
```

This tells us that each batch is 10,000 × 3 × 32 × 32, or over 30 million floating-point values. NumPy's `"float"` dtype is float64, so each value occupies eight bytes, which means each batch takes at least 240 MB in memory. Loading six of them (5 training + 1 test) totals nearly 1.4 GB of data.
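To sanity-check those numbers (a quick back-of-the-envelope sketch, not from the original post):

```python
values_per_batch = 10000 * 3 * 32 * 32         # 30,720,000 values per batch
bytes_per_batch = values_per_batch * 8         # astype("float") gives float64: 8 bytes per value
print(bytes_per_batch / 2**20, "MiB per batch")        # ~234 MiB
print(6 * bytes_per_batch / 2**30, "GiB for all six")  # ~1.37 GiB (5 train + 1 test)
# Note: astype("float32") would halve this; the raw uint8 data is only 1 byte per value.
```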


```python
for b in range(1, 2):
    f = os.path.join(ROOT, 'data_batch_%d' % (b,))
    X, Y = load_CIFAR_batch(f)
    xs.append(X)
    ys.append(Y)
```


So if memory is tight, you can load only one batch at a time, as in the loop above (`range(1, 2)` loads just `data_batch_1`). If you want to choose which batches to load without editing the loop each time, a small variant is sketched below.
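This is my own generalization, not part of the original fix; `load_CIFAR_batches` is a hypothetical helper that takes the batch numbers as a parameter:

```python
def load_CIFAR_batches(ROOT, batch_ids=(1,)):
    """Load only the requested CIFAR-10 training batches, e.g. (1,) or (1, 2, 3, 4, 5)."""
    xs, ys = [], []
    for b in batch_ids:
        f = os.path.join(ROOT, 'data_batch_%d' % (b,))
        X, Y = load_CIFAR_batch(f)  # reuses the fixed loader from above
        xs.append(X)
        ys.append(Y)
    return np.concatenate(xs), np.concatenate(ys)

# Low-memory run: just the first batch (10,000 images, ~234 MiB as float64)
# Xtr, Ytr = load_CIFAR_batches(cifar10_dir, batch_ids=(1,))
```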





At this point the errors are basically fixed; the full corrected code is below:


```python
# -*- coding: utf-8 -*-
import pickle as p
import numpy as np
import os


def load_CIFAR_batch(filename):
    """Load a single batch of the CIFAR dataset."""
    with open(filename, 'rb') as f:
        datadict = p.load(f, encoding='latin1')
        X = datadict['data']
        Y = datadict['labels']
        X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
        Y = np.array(Y)
        return X, Y


def load_CIFAR10(ROOT):
    """Load the CIFAR-10 data."""
    xs = []
    ys = []
    for b in range(1, 2):  # load only batch 1 to stay within memory; use range(1, 6) for all five
        f = os.path.join(ROOT, 'data_batch_%d' % (b,))
        X, Y = load_CIFAR_batch(f)
        xs.append(X)  # collect the batches
        ys.append(Y)
    Xtr = np.concatenate(xs)  # stack batches along axis 0; with all five batches Xtr would be (50000, 32, 32, 3)
    Ytr = np.concatenate(ys)
    del X, Y
    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
    return Xtr, Ytr, Xte, Yte
```

And the script that uses it:

```python
import numpy as np
from julyedu.data_utils import load_CIFAR10
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Load the CIFAR-10 dataset
cifar10_dir = 'julyedu/datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

# Check the shapes of the loaded arrays
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
```
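Since `load_CIFAR10` above loads only the first training batch (`range(1, 2)`), the printed shapes should come out as follows (derived from the code, not captured output):

```
Training data shape:  (10000, 32, 32, 3)
Training labels shape:  (10000,)
Test data shape:  (10000, 32, 32, 3)
Test labels shape:  (10000,)
```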

By the way, here is the composition of the CIFAR-10 dataset: 60,000 32×32 color images in 10 classes, with 6,000 images per class. It is split into 50,000 training images and 10,000 test images, distributed as five pickled training batches (`data_batch_1` through `data_batch_5`) of 10,000 images each, plus a single `test_batch`.
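If you want to poke at a batch yourself, each unpickled batch is a plain dict; a minimal sketch (the keys shown are the standard CIFAR-10 field names; adjust the path to your setup):

```python
import pickle

with open('julyedu/datasets/cifar-10-batches-py/data_batch_1', 'rb') as f:
    batch = pickle.load(f, encoding='latin1')

print(batch.keys())         # 'batch_label', 'labels', 'data', 'filenames'
print(batch['data'].shape)  # (10000, 3072): each row is a flattened 32x32x3 image
print(batch['data'].dtype)  # uint8
```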
For more content, follow my personal WeChat official account.