Introduction to Importing Data in Python

By Salerno | March 19, 2020

1. Importing entire text files


# Open a file: file
file = open('c:/blogdown/moby_dick.txt', mode='r')

# Print it
print(file.read())

# Check whether file is closed
## CHAPTER 1. Loomings.
## 
## Call me Ishmael. Some years ago--never mind how long precisely--having
## little or no money in my purse, and nothing particular to interest me on
## shore, I thought I would sail about a little and see the watery part of
## the world. It is a way I have of driving off the spleen and regulating
## the circulation. Whenever I find myself growing grim about the mouth;
## whenever it is a damp, drizzly November in my soul; whenever I find
## myself involuntarily pausing before coffin warehouses, and bringing up
## the rear of every funeral I meet; and especially whenever my hypos get
## such an upper hand of me, that it requires a strong moral principle to
## prevent me from deliberately stepping into the street, and methodically
## knocking people's hats off--then, I account it high time to get to sea
## as soon as I can. This is my substitute for pistol and ball. With a
## philosophical flourish Cato throws himself upon his sword; I quietly
## take to the ship. There is nothing surprising in this. If they but knew
## it, almost all men in their degree, some time or other, cherish very
## nearly the same feelings towards the ocean with me.
print(file.closed)

# Close file
## False
file.close()

# Check whether file is closed
print(file.closed)
## True

2. Importing text files line by line


# Read & print the first 3 lines
with open('c:/blogdown/moby_dick.txt') as file:
    print(file.readline())
    print(file.readline())
    print(file.readline())
## CHAPTER 1. Loomings.
## 
## 
## 
## Call me Ishmael. Some years ago--never mind how long precisely--having

3. Using NumPy to import flat files


# Import package
import numpy as np
import matplotlib.pyplot as plt

# Assign filename to variable: file
file = 'c:/blogdown/mnist_kaggle_some_rows.csv'

# Load file as array: digits
digits = np.loadtxt(file, delimiter=',')

# Print datatype of digits
print(type(digits))

# Select and reshape a row
## <class 'numpy.ndarray'>
im = digits[21, 1:]
im_sq = np.reshape(im, (28, 28))
# Plot reshaped data (matplotlib.pyplot already loaded as plt)
plt.imshow(im_sq, cmap='Greys', interpolation='nearest')
plt.show()
## Traceback (most recent call last):
##   File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\backends\backend_qt5.py", line 508, in _draw_idle
##     self.draw()
##   File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py", line 388, in draw
##     self.figure.draw(self.renderer)
##   File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\artist.py", line 38, in draw_wrapper
##     return draw(artist, renderer, *args, **kwargs)
##   File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\figure.py", line 1709, in draw
##     renderer, self, artists, self.suppressComposite)
##   File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\image.py", line 135, in _draw_list_compositing_images
##     a.draw(renderer)
##   File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\artist.py", line 38, in draw_wrapper
##     return draw(artist, renderer, *args, **kwargs)
##   File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py", line 2607, in draw
##     self._update_title_position(renderer)
##   File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py", line 2548, in _update_title_position
##     ax.xaxis.get_ticks_position() in choices):
##   File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axis.py", line 2146, in get_ticks_position
##     self._get_ticks_position()]
##   File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axis.py", line 1833, in _get_ticks_position
##     minor = self.minorTicks[0]
## IndexError: list index out of range

4. Customizing your NumPy import


# Assign filename: file
file = 'c:/blogdown/seaslug.txt'

# Import file: data
data = np.loadtxt(file, delimiter='\t', dtype=str)

# Print the first element of data
print(data[0])

# Import data as floats and skip the first row: data_float
## ['Time' 'Percent']
data_float = np.loadtxt(file, delimiter='\t', dtype=float, skiprows=1)

# Print the 10th element of data_float
print(data_float[9])

# Plot a scatterplot of the data
## [0.    0.357]
plt.scatter(data_float[:, 0], data_float[:, 1])
plt.xlabel('time (min.)')
plt.ylabel('percentage of larvae')
plt.show()

5. Working with mixed datatypes


data = 'c:/blogdown/titanic_sub.csv'

data = np.genfromtxt(data, delimiter=',', names=True, dtype=None)
## C:/ProgramData/Anaconda3/python.exe:1: VisibleDeprecationWarning: Reading unicode strings without specifying the encoding argument is deprecated. Set the encoding, use None for the system default.
np.shape(data)
## (891,)
print(data['Fare'])
## [  7.25    71.2833   7.925   53.1      8.05     8.4583  51.8625  21.075
##   11.1333  30.0708  16.7     26.55     8.05    31.275    7.8542  16.
##   29.125   13.      18.       7.225   26.      13.       8.0292  35.5
##   21.075   31.3875   7.225  263.       7.8792   7.8958  27.7208 146.5208
##    7.75    10.5     82.1708  52.       7.2292   8.05    18.      11.2417
##    9.475   21.       7.8958  41.5792   7.8792   8.05    15.5      7.75
##   21.6792  17.8     39.6875   7.8     76.7292  26.      61.9792  35.5
##   10.5      7.2292  27.75    46.9      7.2292  80.      83.475   27.9
##   27.7208  15.2458  10.5      8.1583   7.925    8.6625  10.5     46.9
##   73.5     14.4542  56.4958   7.65     7.8958   8.05    29.      12.475
##    9.       9.5      7.7875  47.1     10.5     15.85    34.375    8.05
##  263.       8.05     8.05     7.8542  61.175   20.575    7.25     8.05
##   34.6542  63.3583  23.      26.       7.8958   7.8958  77.2875   8.6542
##    7.925    7.8958   7.65     7.775    7.8958  24.15    52.      14.4542
##    8.05     9.825   14.4583   7.925    7.75    21.     247.5208  31.275
##   73.5      8.05    30.0708  13.      77.2875  11.2417   7.75     7.1417
##   22.3583   6.975    7.8958   7.05    14.5     26.      13.      15.0458
##   26.2833  53.1      9.2167  79.2     15.2458   7.75    15.85     6.75
##   11.5     36.75     7.7958  34.375   26.      13.      12.525   66.6
##    8.05    14.5      7.3125  61.3792   7.7333   8.05     8.6625  69.55
##   16.1     15.75     7.775    8.6625  39.6875  20.525   55.      27.9
##   25.925   56.4958  33.5     29.125   11.1333   7.925   30.6958   7.8542
##   25.4667  28.7125  13.       0.      69.55    15.05    31.3875  39.
##   22.025   50.      15.5     26.55    15.5      7.8958  13.      13.
##    7.8542  26.      27.7208 146.5208   7.75     8.4042   7.75    13.
##    9.5     69.55     6.4958   7.225    8.05    10.4625  15.85    18.7875
##    7.75    31.       7.05    21.       7.25    13.       7.75   113.275
##    7.925   27.      76.2917  10.5      8.05    13.       8.05     7.8958
##   90.       9.35    10.5      7.25    13.      25.4667  83.475    7.775
##   13.5     31.3875  10.5      7.55    26.      26.25    10.5     12.275
##   14.4542  15.5     10.5      7.125    7.225   90.       7.775   14.5
##   52.5542  26.       7.25    10.4625  26.55    16.1     20.2125  15.2458
##   79.2     86.5    512.3292  26.       7.75    31.3875  79.65     0.
##    7.75    10.5     39.6875   7.775  153.4625 135.6333  31.       0.
##   19.5     29.7      7.75    77.9583   7.75     0.      29.125   20.25
##    7.75     7.8542   9.5      8.05    26.       8.6625   9.5      7.8958
##   13.       7.75    78.85    91.0792  12.875    8.85     7.8958  27.7208
##    7.2292 151.55    30.5    247.5208   7.75    23.25     0.      12.35
##    8.05   151.55   110.8833 108.9     24.      56.9292  83.1583 262.375
##   26.       7.8958  26.25     7.8542  26.      14.     164.8667 134.5
##    7.25     7.8958  12.35    29.      69.55   135.6333   6.2375  13.
##   20.525   57.9792  23.25    28.5    153.4625  18.     133.65     7.8958
##   66.6    134.5      8.05    35.5     26.     263.      13.      13.
##   13.      13.      13.      16.1     15.9      8.6625   9.225   35.
##    7.2292  17.8      7.225    9.5     55.      13.       7.8792   7.8792
##   27.9     27.7208  14.4542   7.05    15.5      7.25    75.25     7.2292
##    7.75    69.3     55.4417   6.4958   8.05   135.6333  21.075   82.1708
##    7.25   211.5      4.0125   7.775  227.525   15.7417   7.925   52.
##    7.8958  73.5     46.9     13.       7.7292  12.     120.       7.7958
##    7.925  113.275   16.7      7.7958   7.8542  26.      10.5     12.65
##    7.925    8.05     9.825   15.85     8.6625  21.       7.75    18.75
##    7.775   25.4667   7.8958   6.8583  90.       0.       7.925    8.05
##   32.5     13.      13.      24.15     7.8958   7.7333   7.875   14.4
##   20.2125   7.25    26.      26.       7.75     8.05    26.55    16.1
##   26.       7.125   55.9    120.      34.375   18.75   263.      10.5
##   26.25     9.5      7.775   13.       8.1125  81.8583  19.5     26.55
##   19.2583  30.5     27.75    19.9667  27.75    89.1042   8.05     7.8958
##   26.55    51.8625  10.5      7.75    26.55     8.05    38.5     13.
##    8.05     7.05     0.      26.55     7.725   19.2583   7.25     8.6625
##   27.75    13.7917   9.8375  52.      21.       7.0458   7.5208  12.2875
##   46.9      0.       8.05     9.5875  91.0792  25.4667  90.      29.7
##    8.05    15.9     19.9667   7.25    30.5     49.5042   8.05    14.4583
##   78.2667  15.1    151.55     7.7958   8.6625   7.75     7.6292   9.5875
##   86.5    108.9     26.      26.55    22.525   56.4958   7.75     8.05
##   26.2875  59.4      7.4958  34.0208  10.5     24.15    26.       7.8958
##   93.5      7.8958   7.225   57.9792   7.2292   7.75    10.5    221.7792
##    7.925   11.5     26.       7.2292   7.2292  22.3583   8.6625  26.25
##   26.55   106.425   14.5     49.5     71.      31.275   31.275   26.
##  106.425   26.      26.      13.8625  20.525   36.75   110.8833  26.
##    7.8292   7.225    7.775   26.55    39.6    227.525   79.65    17.4
##    7.75     7.8958  13.5      8.05     8.05    24.15     7.8958  21.075
##    7.2292   7.8542  10.5     51.4792  26.3875   7.75     8.05    14.5
##   13.      55.9     14.4583   7.925   30.     110.8833  26.      40.125
##    8.7125  79.65    15.      79.2      8.05     8.05     7.125   78.2667
##    7.25     7.75    26.      24.15    33.       0.       7.225   56.9292
##   27.       7.8958  42.4      8.05    26.55    15.55     7.8958  30.5
##   41.5792 153.4625  31.275    7.05    15.5      7.75     8.05    65.
##   14.4     16.1     39.      10.5     14.4542  52.5542  15.7417   7.8542
##   16.1     32.3208  12.35    77.9583   7.8958   7.7333  30.       7.0542
##   30.5      0.      27.9     13.       7.925   26.25    39.6875  16.1
##    7.8542  69.3     27.9     56.4958  19.2583  76.7292   7.8958  35.5
##    7.55     7.55     7.8958  23.       8.4333   7.8292   6.75    73.5
##    7.8958  15.5     13.     113.275  133.65     7.225   25.5875   7.4958
##    7.925   73.5     13.       7.775    8.05    52.      39.      52.
##   10.5     13.       0.       7.775    8.05     9.8417  46.9    512.3292
##    8.1375  76.7292   9.225   46.9     39.      41.5792  39.6875  10.1708
##    7.7958 211.3375  57.      13.4167  56.4958   7.225   26.55    13.5
##    8.05     7.7333 110.8833   7.65   227.525   26.2875  14.4542   7.7417
##    7.8542  26.      13.5     26.2875 151.55    15.2458  49.5042  26.55
##   52.       9.4833  13.       7.65   227.525   10.5     15.5      7.775
##   33.       7.0542  13.      13.      53.1      8.6625  21.       7.7375
##   26.       7.925  211.3375  18.7875   0.      13.      13.      16.1
##   34.375  512.3292   7.8958   7.8958  30.      78.85   262.375   16.1
##    7.925   71.      20.25    13.      53.1      7.75    23.      12.475
##    9.5      7.8958  65.      14.5      7.7958  11.5      8.05    86.5
##   14.5      7.125    7.2292 120.       7.775   77.9583  39.6      7.75
##   24.15     8.3625   9.5      7.8542  10.5      7.225   23.       7.75
##    7.75    12.475    7.7375 211.3375   7.2292  57.      30.      23.45
##    7.05     7.25     7.4958  29.125   20.575   79.2      7.75    26.
##   69.55    30.6958   7.8958  13.      25.9292   8.6833   7.2292  24.15
##   13.      26.25   120.       8.5167   6.975    7.775    0.       7.775
##   13.      53.1      7.8875  24.15    10.5     31.275    8.05     0.
##    7.925   37.0042   6.45    27.9     93.5      8.6625   0.      12.475
##   39.6875   6.95    56.4958  37.0042   7.75    80.      14.4542  18.75
##    7.2292   7.8542   8.3     83.1583   8.6625   8.05    56.4958  29.7
##    7.925   10.5     31.       6.4375   8.6625   7.55    69.55     7.8958
##   33.      89.1042  31.275    7.775   15.2458  39.4     26.       9.35
##  164.8667  26.55    19.2583   7.2292  14.1083  11.5     25.9292  69.55
##   13.      13.      13.8583  50.4958   9.5     11.1333   7.8958  52.5542
##    5.       9.      24.       7.225    9.8458   7.8958   7.8958  83.1583
##   26.       7.8958  10.5167  10.5      7.05    29.125   13.      30.
##   23.45    30.       7.75  ]

6. Working with mixed datatypes (2)


data = 'c:/blogdown/titanic_sub.csv'

# Import file using np.recfromcsv: d
d = np.recfromcsv(data)

# Print out first three entries of d
## C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\npyio.py:2372: VisibleDeprecationWarning: Reading unicode strings without specifying the encoding argument is deprecated. Set the encoding, use None for the system default.
##   output = genfromtxt(fname, **kwargs)
print(d[:3])
## [(1, 0, 3, b'male', 22., 1, 0, b'A/5 21171',  7.25  , b'', b'S')
##  (2, 1, 1, b'female', 38., 1, 0, b'PC 17599', 71.2833, b'C85', b'C')
##  (3, 1, 3, b'female', 26., 0, 0, b'STON/O2. 3101282',  7.925 , b'', b'S')]

7. Using Pandas to import flatfiles as DataFrames


# Import pandas as pd
import pandas as pd

# Assign the filename: file
file = 'c:/blogdown/titanic_sub.csv'

# Read the file into a DataFrame: df
df = pd.read_csv(file)

# View the head of the DataFrame
print(df.head())
##    PassengerId  Survived  Pclass  ...     Fare  Cabin  Embarked
## 0            1         0       3  ...   7.2500    NaN         S
## 1            2         1       1  ...  71.2833    C85         C
## 2            3         1       3  ...   7.9250    NaN         S
## 3            4         1       1  ...  53.1000   C123         S
## 4            5         0       3  ...   8.0500    NaN         S
## 
## [5 rows x 11 columns]
type(df)
## <class 'pandas.core.frame.DataFrame'>

8. Using pandas to import flat files as DataFrames (2)


# Assign the filename: file
file = 'c:/blogdown/titanic_sub.csv'

# Read the first 5 rows of the file into a DataFrame: data
data = pd.read_csv(file, nrows=5, header=None)

# Build a numpy array from the DataFrame: data_array
data_array = np.array(data)

# Print the datatype of data_array to the shell
print(type(data_array))
## <class 'numpy.ndarray'>

9. Customizing your pandas import


# Import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

# Assign filename: file
file = 'c:/blogdown/titanic_corrupt.txt'

# Import file: data
data = pd.read_csv(file, sep='\t', comment='#', na_values='Nothing')

# Print the head of the DataFrame
print(data.head())

# Plot 'Age' variable in a histogram
##    PassengerId  Survived  Pclass  ...     Fare  Cabin    Embarked
## 0            1         0       3  ...   7.2500    NaN  S #dfafdad
## 1            2         1       1  ...  71.2833    C85           C
## 2            3         1       3  ...   7.9250    NaN           S
## 3            4         1       1  ...  53.1000   C123           S
## 4            5         0       3  ...   8.0500    NaN           S
## 
## [5 rows x 11 columns]
pd.DataFrame.hist(data[['Age']])
## array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000002025C248>]],
##       dtype=object)
plt.xlabel('Age (years)')
plt.ylabel('count')
plt.show()

10. Not so flat any more


import os
wd = os.getcwd()
os.listdir(wd)
## ['2015-07-23-r-rmarkdown.html', '2015-07-23-r-rmarkdown.Rmd', '2019-12-18-linear-regression.html', '2019-12-18-linear-regression.Rmd', '2019-12-19-data-frame.html', '2019-12-19-data-frame.Rmd', '2019-12-21-exponential-smoothing-model.html', '2019-12-21-exponential-smoothing-model.Rmd', '2019-12-23-r-packages-for-regression.html', '2019-12-23-r-packages-for-regression.Rmd', '2019-12-23-random-forest.html', '2019-12-23-random-forest.Rmd', '2019-12-27-quicksort-algorithm.html', '2019-12-27-quicksort-algorithm.Rmd', '2019-12-28-binary-search.html', '2019-12-28-binary-search.Rmd', '2020-01-05-knn-algorithm.html', '2020-01-05-knn-algorithm.Rmd', '2020-01-13-classifying-using-logistic-regression.html', '2020-01-13-classifying-using-logistic-regression.Rmd', '2020-01-19-correlation-and-regression.html', '2020-01-19-correlation-and-regression.Rmd', '2020-02-07-german-credit-and-regression-tree.html', '2020-02-07-german-credit-and-regression-tree.Rmd', '2020-02-29-a-game-of-chance.html', '2020-02-29-a-game-of-chance.Rmd', '2020-03-01-functions.html', '2020-03-01-functions.Rmd', '2020-03-09-iterables-versus-iterators.html', '2020-03-09-iterables-versus-iterators.Rmd', '2020-03-14-linear-models-scikit-learn.html', '2020-03-14-linear-models-scikit-learn.Rmd', '2020-03-14-tensorflow-2-quickstart-for-beginners.html', '2020-03-14-tensorflow-2-quickstart-for-beginners.Rmd', '2020-03-15-python-data-science-part-1.html', '2020-03-15-python-data-science-part-1.Rmd', '2020-03-18-supervised-learning-with-scikit-learn.html', '2020-03-18-supervised-learning-with-scikit-learn.Rmd', '2020-03-19-introduction-to-importing-data-in-python.html', '2020-03-19-introduction-to-importing-data-in-python.Rmd', '2020-03-19-introduction-to-importing-data-in-python_files', 'Chinook.sqlite', 'creating-a-new-theme.md', 'go-is-for-lovers.md', 'heart-disease-uci.zip', 'heart.csv', 'hugo-is-for-lovers.md', 'linked-post.md', 'migrate-from-jekyll.md', 'Northwind.sqlite', '_book', '_bookdown_files']

11. Listing sheets in Excel files


# Import pandas
import pandas as pd

# Assign spreadsheet filename: file
file = 'c:/blogdown/battledeath.xlsx'

# Load spreadsheet: xls
xls = pd.ExcelFile(file)

# Print sheet names
print(xls.sheet_names)
## ['2002', '2004']

12. Importing sheets from Excel files


# Assign spreadsheet filename: file
file = 'c:/blogdown/battledeath.xlsx'

# Load spreadsheet: xls
xls = pd.ExcelFile(file)

# Load a sheet into a DataFrame by name: df1
file1 = xls.parse('2004')

# Print the head of the DataFrame df1
print(file1.head())

# Load a sheet into a DataFrame by index: df2
##   War(country)      2004
## 0  Afghanistan  9.451028
## 1      Albania  0.130354
## 2      Algeria  3.407277
## 3      Andorra  0.000000
## 4       Angola  2.597931
file2 = xls.parse(0)

# Print the head of the DataFrame df2
print(file2.head())
##   War, age-adjusted mortality due to       2002
## 0                        Afghanistan  36.083990
## 1                            Albania   0.128908
## 2                            Algeria  18.314120
## 3                            Andorra   0.000000
## 4                             Angola  18.964560

13. Customizing your spreadsheet import


# Assign spreadsheet filename: file
file = 'c:/blogdown/battledeath.xlsx'

# Load spreadsheet: xls
xls = pd.ExcelFile(file)

# Parse the first sheet and rename the columns: df1
df1 = xls.parse(0, skiprows=1, names=('Country', 'AAM due to War (2002)'))

# Print the head of the DataFrame df1
print(df1.head())

# Parse the first column of the second sheet and rename the column: df2
##                Country  AAM due to War (2002)
## 0              Albania               0.128908
## 1              Algeria              18.314120
## 2              Andorra               0.000000
## 3               Angola              18.964560
## 4  Antigua and Barbuda               0.000000
df2 = xls.parse(1, usecols=[0], skiprows=[0], names=['Country'])

# Print the head of the DataFrame df2
print(df2.head())
##                Country
## 0              Albania
## 1              Algeria
## 2              Andorra
## 3               Angola
## 4  Antigua and Barbuda

14. Importing SAS files


# Import sas7bdat package
from sas7bdat import SAS7BDAT 

# Save file to a DataFrame: df_sas
with SAS7BDAT('c:/blogdown/sales.sas7bdat') as file:
    df_sas = file.to_data_frame()

# Print head of DataFrame
print(df_sas.head())

# Plot histogram of DataFrame features (pandas and pyplot already imported)
##      YEAR     P           S
## 0  1950.0  12.9  181.899994
## 1  1951.0  11.9  245.000000
## 2  1952.0  10.7  250.199997
## 3  1953.0  11.3  265.899994
## 4  1954.0  11.2  248.500000
pd.DataFrame.hist(df_sas[['P']])
## array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000002C1DBB88>]],
##       dtype=object)
plt.ylabel('count')
plt.show()

15. Importing Stata files


# Import pandas
import pandas as pd

# Load Stata file into a pandas DataFrame: df
df = pd.read_stata('c:/blogdown/disarea.dta')

# Print the head of the DataFrame df
print(df.head())

# Plot histogram of one column of the DataFrame
##   wbcode               country  disa1  disa2  ...  disa22  disa23  disa24  disa25
## 0    AFG           Afghanistan   0.00   0.00  ...    0.00    0.02    0.00    0.00
## 1    AGO                Angola   0.32   0.02  ...    0.99    0.98    0.61    0.00
## 2    ALB               Albania   0.00   0.00  ...    0.00    0.00    0.00    0.16
## 3    ARE  United Arab Emirates   0.00   0.00  ...    0.00    0.00    0.00    0.00
## 4    ARG             Argentina   0.00   0.24  ...    0.00    0.01    0.00    0.11
## 
## [5 rows x 27 columns]
pd.DataFrame.hist(df[['disa10']])
## array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000002C1DB648>]],
##       dtype=object)
plt.xlabel('Extent of disease')
plt.ylabel('Number of countries')
plt.show()

16. Using h5py to import HDF5 files


# Import packages
import numpy as np
import h5py

# Assign filename: file
file = 'c:/blogdown/L-L1_LOSC_4_V1-1126259446-32.hdf5'

# Load file: data
data = h5py.File(file, 'r')

# Print the datatype of the loaded file
print(type(data))

# Print the keys of the file
## <class 'h5py._hl.files.File'>
for key in data.keys():
    print(key)
## meta
## quality
## strain

17. Extracting data from your HDF5 file


# Import packages
import numpy as np
import h5py

# Assign filename: file
file = 'c:/blogdown/L-L1_LOSC_4_V1-1126259446-32.hdf5'

# Load file: data
data = h5py.File(file, 'r')


# Get the HDF5 group: group
group = data['strain']

# Check out keys of group
for key in group.keys():
    print(key)

# Set variable equal to time series data: strain
## Strain
strain = data['strain']['Strain'].value

# Set number of time points to sample: num_samples
## C:/ProgramData/Anaconda3/python.exe:1: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
num_samples = 10000

# Set time vector
time = np.arange(0, 1, 1/num_samples)

# Plot data
plt.plot(time, strain[:num_samples])
plt.xlabel('GPS Time (s)')
plt.ylabel('strain')
plt.show()

18. Loading .mat files


# Import package
import scipy.io

# Load MATLAB file: mat
mat = scipy.io.loadmat('c:/blogdown/albeck_gene_expression.mat')

# Print the datatype type of mat
print(type(mat))
## <class 'dict'>

19. The structure of .mat in Python


import scipy.io
import matplotlib.pyplot as plt
import numpy as np


# Load MATLAB file: mat
mat = scipy.io.loadmat('c:/blogdown/albeck_gene_expression.mat')

# Print the keys of the MATLAB dictionary
print(mat.keys())

# Print the type of the value corresponding to the key 'CYratioCyt'
## dict_keys(['__header__', '__version__', '__globals__', 'rfpCyt', 'rfpNuc', 'cfpNuc', 'cfpCyt', 'yfpNuc', 'yfpCyt', 'CYratioCyt'])
print(type(mat['CYratioCyt']))

# Print the shape of the value corresponding to the key 'CYratioCyt'
## <class 'numpy.ndarray'>
print(mat['CYratioCyt'].shape)

# Subset the array and plot it
## (200, 137)
data = mat['CYratioCyt'][25, 5:]
fig = plt.figure()
plt.plot(data)
plt.xlabel('time (min.)')
plt.ylabel('normalized fluorescence (measure of expression)')
plt.show()

20. Creating a database engine


# Import necessary module
from sqlalchemy import create_engine

file = 'sqlite:///c:/blogdown/Chinook.sqlite'

# Create engine: engine
engine = create_engine(file)

table_names = engine.table_names()
print(table_names)
## ['Album', 'Artist', 'Customer', 'Employee', 'Genre', 'Invoice', 'InvoiceLine', 'MediaType', 'Playlist', 'PlaylistTrack', 'Track']

21. The Hello World of SQL Queries!


# Import packages
from sqlalchemy import create_engine
import pandas as pd

file = 'sqlite:///c:/blogdown/Chinook.sqlite'

# Create engine: engine
engine = create_engine(file)

# Open engine connection: con
con = engine.connect()

# Perform query: rs
rs = con.execute('SELECT * FROM Album')

# Save results of the query to DataFrame: df
df = pd.DataFrame(rs.fetchall())

# Close connection
con.close()

# Print head of DataFrame df
print(df.head())
##    0                                      1  2
## 0  1  For Those About To Rock We Salute You  1
## 1  2                      Balls to the Wall  2
## 2  3                      Restless and Wild  2
## 3  4                      Let There Be Rock  1
## 4  5                               Big Ones  3

22. Customizing the Hello World of SQL Queries


# Import packages
from sqlalchemy import create_engine
import pandas as pd

file = 'sqlite:///c:/blogdown/Chinook.sqlite'

# Create engine: engine
engine = create_engine(file)

# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute("SELECT LastName, Title FROM Employee")
    df = pd.DataFrame(rs.fetchmany(size=3))
    df.columns = rs.keys()

# Print the length of the DataFrame df
print(len(df))

# Print the head of the DataFrame df
## 3
print(df.head())
##   LastName                Title
## 0    Adams      General Manager
## 1  Edwards        Sales Manager
## 2  Peacock  Sales Support Agent

23. Filtering your database records using SQL’s WHERE


# Import packages
from sqlalchemy import create_engine
import pandas as pd

file = 'sqlite:///c:/blogdown/Chinook.sqlite'

# Create engine: engine
engine = create_engine(file)

# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute("SELECT * FROM Employee WHERE EmployeeId >= 6")
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()

# Print the head of the DataFrame df
print(df.head())
##    EmployeeId  LastName  ...                Fax                    Email
## 0           6  Mitchell  ...  +1 (403) 246-9899  michael@chinookcorp.com
## 1           7      King  ...  +1 (403) 456-8485   robert@chinookcorp.com
## 2           8  Callahan  ...  +1 (403) 467-8772    laura@chinookcorp.com
## 
## [3 rows x 15 columns]

24. Ordering your SQL records with ORDER BY



# Import packages
from sqlalchemy import create_engine
import pandas as pd

file = 'sqlite:///c:/blogdown/Chinook.sqlite'

# Create engine: engine
engine = create_engine(file)

# Open engine in context manager
with engine.connect() as con:
    rs = con.execute("SELECT * FROM Employee ORDER BY BirthDate")
    df = pd.DataFrame(rs.fetchall())

    # Set the DataFrame's column names
    df.columns = rs.keys()

# Print head of DataFrame
print(df.head())
##    EmployeeId  LastName  ...                Fax                     Email
## 0           4      Park  ...  +1 (403) 263-4289  margaret@chinookcorp.com
## 1           2   Edwards  ...  +1 (403) 262-3322     nancy@chinookcorp.com
## 2           1     Adams  ...  +1 (780) 428-3457    andrew@chinookcorp.com
## 3           5   Johnson  ...   1 (780) 836-9543     steve@chinookcorp.com
## 4           8  Callahan  ...  +1 (403) 467-8772     laura@chinookcorp.com
## 
## [5 rows x 15 columns]

25. Pandas and The Hello World of SQL Queries!



# Import packages
from sqlalchemy import create_engine
import pandas as pd

file = 'sqlite:///c:/blogdown/Chinook.sqlite'

# Create engine: engine
engine = create_engine(file)

# Execute query and store records in DataFrame: df
df = pd.read_sql_query("SELECT * FROM Album", engine)

# Print head of DataFrame
print(df.head())

# Open engine in context manager and store query result in df1
##    AlbumId                                  Title  ArtistId
## 0        1  For Those About To Rock We Salute You         1
## 1        2                      Balls to the Wall         2
## 2        3                      Restless and Wild         2
## 3        4                      Let There Be Rock         1
## 4        5                               Big Ones         3
with engine.connect() as con:
    rs = con.execute("SELECT * FROM Album")
    df1 = pd.DataFrame(rs.fetchall())
    df1.columns = rs.keys()

# Confirm that both methods yield the same result
print(df.equals(df1))
## True

26. Pandas for more complex querying


# Import packages
from sqlalchemy import create_engine
import pandas as pd

file = 'sqlite:///c:/blogdown/Chinook.sqlite'

# Create engine: engine
engine = create_engine(file)


# Execute query and store records in DataFrame: df
df = pd.read_sql_query("SELECT * FROM Employee WHERE EmployeeId >= 6 ORDER BY BirthDate", engine)

# Print head of DataFrame
print(df.head())
##    EmployeeId  LastName  ...                Fax                    Email
## 0           8  Callahan  ...  +1 (403) 467-8772    laura@chinookcorp.com
## 1           7      King  ...  +1 (403) 456-8485   robert@chinookcorp.com
## 2           6  Mitchell  ...  +1 (403) 246-9899  michael@chinookcorp.com
## 
## [3 rows x 15 columns]

27. The power of SQL lies in relationships between tables: INNER JOIN


# Import packages
from sqlalchemy import create_engine
import pandas as pd

file = 'sqlite:///c:/blogdown/Chinook.sqlite'

# Create engine: engine
engine = create_engine(file)

# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute("SELECT Title, Name FROM Album INNER JOIN Artist on Album.ArtistID = Artist.ArtistID")
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()

# Print head of DataFrame df
print(df.head())
##                                    Title       Name
## 0  For Those About To Rock We Salute You      AC/DC
## 1                      Balls to the Wall     Accept
## 2                      Restless and Wild     Accept
## 3                      Let There Be Rock      AC/DC
## 4                               Big Ones  Aerosmith

28. Filtering your INNER JOIN



# Import packages
from sqlalchemy import create_engine
import pandas as pd

file = 'sqlite:///c:/blogdown/Chinook.sqlite'

# Create engine: engine
engine = create_engine(file)

# Execute query and store records in DataFrame: df
df = pd.read_sql_query("SELECT * FROM PlaylistTrack INNER JOIN Track on PlaylistTrack.TrackId = Track.TrackId WHERE Milliseconds < 250000", engine)

# Print head of DataFrame
print(df.head())
##    PlaylistId  TrackId  TrackId  ... Milliseconds    Bytes  UnitPrice
## 0           1     3390     3390  ...       217732  3559040       0.99
## 1           1     3392     3392  ...       230758  3766605       0.99
## 2           1     3393     3393  ...       218916  3577821       0.99
## 3           1     3394     3394  ...       228366  3728955       0.99
## 4           1     3395     3395  ...       213831  3497176       0.99
## 
## [5 rows x 11 columns]
comments powered by Disqus