Week 6 Videos
Contents
Week 6 Videos¶
Merging two DataFrames, Part 1¶
import pandas as pd
df = pd.read_csv("../data/indexData.csv")
df_info = pd.read_csv("../data/indexInfo.csv")
df.head()
Name | Date | Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|---|---|
0 | NYA | 12/31/65 | 528.690002 | 528.690002 | 528.690002 | 528.690002 | 528.690002 | 0.0 |
1 | NYA | 1/3/66 | 527.210022 | 527.210022 | 527.210022 | 527.210022 | 527.210022 | 0.0 |
2 | NYA | 1/4/66 | 527.840027 | 527.840027 | 527.840027 | 527.840027 | 527.840027 | 0.0 |
3 | NYA | 1/5/66 | 531.119995 | 531.119995 | 531.119995 | 531.119995 | 531.119995 | 0.0 |
4 | NYA | 1/6/66 | 532.070007 | 532.070007 | 532.070007 | 532.070007 | 532.070007 | 0.0 |
df["Name"].unique()
array(['NYA', 'IXIC', 'HSI', '000001.SS', 'GSPTSE', '399001.SZ', 'NSEI',
'GDAXI', 'KS11', 'SSMI', 'TWII', 'J203.JO', 'N225', 'N100'],
dtype=object)
df_info.head()
Region | Exchange | Index | Currency | |
---|---|---|---|---|
0 | United States | New York Stock Exchange | NYA | USD |
1 | United States | NASDAQ | IXIC | USD |
2 | Hong Kong | Hong Kong Stock Exchange | HSI | HKD |
3 | China | Shanghai Stock Exchange | 000001.SS | CNY |
4 | Japan | Tokyo Stock Exchange | N225 | JPY |
df.merge(df_info, how="left", left_on="Name", right_on="Index")
Name | Date | Open | High | Low | Close | Adj Close | Volume | Region | Exchange | Index | Currency | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NYA | 12/31/65 | 528.690002 | 528.690002 | 528.690002 | 528.690002 | 528.690002 | 0.0 | United States | New York Stock Exchange | NYA | USD |
1 | NYA | 1/3/66 | 527.210022 | 527.210022 | 527.210022 | 527.210022 | 527.210022 | 0.0 | United States | New York Stock Exchange | NYA | USD |
2 | NYA | 1/4/66 | 527.840027 | 527.840027 | 527.840027 | 527.840027 | 527.840027 | 0.0 | United States | New York Stock Exchange | NYA | USD |
3 | NYA | 1/5/66 | 531.119995 | 531.119995 | 531.119995 | 531.119995 | 531.119995 | 0.0 | United States | New York Stock Exchange | NYA | USD |
4 | NYA | 1/6/66 | 532.070007 | 532.070007 | 532.070007 | 532.070007 | 532.070007 | 0.0 | United States | New York Stock Exchange | NYA | USD |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
112452 | N100 | 5/27/21 | 1241.119995 | 1251.910034 | 1241.119995 | 1247.069946 | 1247.069946 | 379696400.0 | Europe | Euronext | N100 | EUR |
112453 | N100 | 5/28/21 | 1249.469971 | 1259.209961 | 1249.030029 | 1256.599976 | 1256.599976 | 160773400.0 | Europe | Euronext | N100 | EUR |
112454 | N100 | 5/31/21 | 1256.079956 | 1258.880005 | 1248.140015 | 1248.930054 | 1248.930054 | 91173700.0 | Europe | Euronext | N100 | EUR |
112455 | N100 | 6/1/21 | 1254.609985 | 1265.660034 | 1254.609985 | 1258.579956 | 1258.579956 | 155179900.0 | Europe | Euronext | N100 | EUR |
112456 | N100 | 6/2/21 | 1258.489990 | 1263.709961 | 1258.239990 | 1263.619995 | 1263.619995 | 148465000.0 | Europe | Euronext | N100 | EUR |
112457 rows × 12 columns
df.head()
Name | Date | Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|---|---|
0 | NYA | 12/31/65 | 528.690002 | 528.690002 | 528.690002 | 528.690002 | 528.690002 | 0.0 |
1 | NYA | 1/3/66 | 527.210022 | 527.210022 | 527.210022 | 527.210022 | 527.210022 | 0.0 |
2 | NYA | 1/4/66 | 527.840027 | 527.840027 | 527.840027 | 527.840027 | 527.840027 | 0.0 |
3 | NYA | 1/5/66 | 531.119995 | 531.119995 | 531.119995 | 531.119995 | 531.119995 | 0.0 |
4 | NYA | 1/6/66 | 532.070007 | 532.070007 | 532.070007 | 532.070007 | 532.070007 | 0.0 |
df = df.merge(df_info, how="left", left_on="Name", right_on="Index")
df.head()
Name | Date | Open | High | Low | Close | Adj Close | Volume | Region | Exchange | Index | Currency | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NYA | 12/31/65 | 528.690002 | 528.690002 | 528.690002 | 528.690002 | 528.690002 | 0.0 | United States | New York Stock Exchange | NYA | USD |
1 | NYA | 1/3/66 | 527.210022 | 527.210022 | 527.210022 | 527.210022 | 527.210022 | 0.0 | United States | New York Stock Exchange | NYA | USD |
2 | NYA | 1/4/66 | 527.840027 | 527.840027 | 527.840027 | 527.840027 | 527.840027 | 0.0 | United States | New York Stock Exchange | NYA | USD |
3 | NYA | 1/5/66 | 531.119995 | 531.119995 | 531.119995 | 531.119995 | 531.119995 | 0.0 | United States | New York Stock Exchange | NYA | USD |
4 | NYA | 1/6/66 | 532.070007 | 532.070007 | 532.070007 | 532.070007 | 532.070007 | 0.0 | United States | New York Stock Exchange | NYA | USD |
Merging two DataFrames, Part 2¶
df1 = pd.DataFrame({"A": range(0,5), "B": list("JKLMN")})
df2 = pd.DataFrame({"B": list("MATHEMATICAL"), "C": range(24,0,-2)})
df1
A | B | |
---|---|---|
0 | 0 | J |
1 | 1 | K |
2 | 2 | L |
3 | 3 | M |
4 | 4 | N |
df2
B | C | |
---|---|---|
0 | M | 24 |
1 | A | 22 |
2 | T | 20 |
3 | H | 18 |
4 | E | 16 |
5 | M | 14 |
6 | A | 12 |
7 | T | 10 |
8 | I | 8 |
9 | C | 6 |
10 | A | 4 |
11 | L | 2 |
df1.merge(df2, how="inner", on="B")
A | B | C | |
---|---|---|---|
0 | 2 | L | 2 |
1 | 3 | M | 24 |
2 | 3 | M | 14 |
df1.merge(df2, how="outer", on="B")
A | B | C | |
---|---|---|---|
0 | 0.0 | J | NaN |
1 | 1.0 | K | NaN |
2 | 2.0 | L | 2.0 |
3 | 3.0 | M | 24.0 |
4 | 3.0 | M | 14.0 |
5 | 4.0 | N | NaN |
6 | NaN | A | 22.0 |
7 | NaN | A | 12.0 |
8 | NaN | A | 4.0 |
9 | NaN | T | 20.0 |
10 | NaN | T | 10.0 |
11 | NaN | H | 18.0 |
12 | NaN | E | 16.0 |
13 | NaN | I | 8.0 |
14 | NaN | C | 6.0 |
df1.merge(df2, how="left", on="B")
A | B | C | |
---|---|---|---|
0 | 0 | J | NaN |
1 | 1 | K | NaN |
2 | 2 | L | 2.0 |
3 | 3 | M | 24.0 |
4 | 3 | M | 14.0 |
5 | 4 | N | NaN |
df1.merge(df2, how="right", on="B")
A | B | C | |
---|---|---|---|
0 | 3.0 | M | 24 |
1 | NaN | A | 22 |
2 | NaN | T | 20 |
3 | NaN | H | 18 |
4 | NaN | E | 16 |
5 | 3.0 | M | 14 |
6 | NaN | A | 12 |
7 | NaN | T | 10 |
8 | NaN | I | 8 |
9 | NaN | C | 6 |
10 | NaN | A | 4 |
11 | 2.0 | L | 2 |
df1.merge(df2, how="right", on="B").isna().any(axis=0)
A True
B False
C False
dtype: bool
Sub-DataFrame from 2007-2009¶
df = pd.read_csv("../data/indexData.csv")
df["Date"]
0 12/31/65
1 1/3/66
2 1/4/66
3 1/5/66
4 1/6/66
...
112452 5/27/21
112453 5/28/21
112454 5/31/21
112455 6/1/21
112456 6/2/21
Name: Date, Length: 112457, dtype: object
pd.to_datetime(df["Date"])
0 2065-12-31
1 2066-01-03
2 2066-01-04
3 2066-01-05
4 2066-01-06
...
112452 2021-05-27
112453 2021-05-28
112454 2021-05-31
112455 2021-06-01
112456 2021-06-02
Name: Date, Length: 112457, dtype: datetime64[ns]
df["Date"].map(lambda s: s[-2:])
0 65
1 66
2 66
3 66
4 66
..
112452 21
112453 21
112454 21
112455 21
112456 21
Name: Date, Length: 112457, dtype: object
df["Date"].map(lambda s: s[-2:]).isin(["07","08","09"])
0 False
1 False
2 False
3 False
4 False
...
112452 False
112453 False
112454 False
112455 False
112456 False
Name: Date, Length: 112457, dtype: bool
df["Date"].map(lambda s: s[-2:]).isin(["07","08","09"]).sum()
9555
df_sub = df[df["Date"].map(lambda s: s[-2:]).isin(["07","08","09"])].copy()
df_sub.tail()
Name | Date | Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|---|---|
109533 | N100 | 12/24/09 | 679.609985 | 680.590027 | 678.789978 | 679.340027 | 679.340027 | 40077000.0 |
109534 | N100 | 12/28/09 | 683.280029 | 685.260010 | 682.710022 | 684.559998 | 684.559998 | 0.0 |
109535 | N100 | 12/29/09 | 686.109985 | 689.830017 | 685.280029 | 687.890015 | 687.890015 | 131860500.0 |
109536 | N100 | 12/30/09 | 686.549988 | 688.530029 | 682.750000 | 683.760010 | 683.760010 | 131415800.0 |
109537 | N100 | 12/31/09 | 686.820007 | 687.210022 | 683.760010 | 683.760010 | 683.760010 | 61050400.0 |
Conditions and Selections in Altair¶
import altair as alt
alt.data_transformers.enable('default', max_rows=10000)
DataTransformerRegistry.enable('default')
df_sub.isna().any()
Name False
Date False
Open True
High True
Low True
Close True
Adj Close True
Volume True
dtype: bool
df_sub = df_sub.dropna().copy()
df_sub.shape
(9518, 8)
df_info
Region | Exchange | Index | Currency | |
---|---|---|---|---|
0 | United States | New York Stock Exchange | NYA | USD |
1 | United States | NASDAQ | IXIC | USD |
2 | Hong Kong | Hong Kong Stock Exchange | HSI | HKD |
3 | China | Shanghai Stock Exchange | 000001.SS | CNY |
4 | Japan | Tokyo Stock Exchange | N225 | JPY |
5 | Europe | Euronext | N100 | EUR |
6 | China | Shenzhen Stock Exchange | 399001.SZ | CNY |
7 | Canada | Toronto Stock Exchange | GSPTSE | CAD |
8 | India | National Stock Exchange of India | NSEI | INR |
9 | Germany | Frankfurt Stock Exchange | GDAXI | EUR |
10 | Korea | Korea Exchange | KS11 | KRW |
11 | Switzerland | SIX Swiss Exchange | SSMI | CHF |
12 | Taiwan | Taiwan Stock Exchange | TWII | TWD |
13 | South Africa | Johannesburg Stock Exchange | J203.JO | ZAR |
df_temp = df_sub.merge(df_info, how="left", left_on="Name", right_on="Index")
df_temp["Date"] = pd.to_datetime(df_temp["Date"])
df_temp
Name | Date | Open | High | Low | Close | Adj Close | Volume | Region | Exchange | Index | Currency | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NYA | 2007-01-03 | 9137.959961 | 9209.299805 | 9092.950195 | 9133.080078 | 9133.080078 | 3.429160e+09 | United States | New York Stock Exchange | NYA | USD |
1 | NYA | 2007-01-04 | 9132.250000 | 9132.250000 | 9070.110352 | 9113.160156 | 9113.160156 | 0.000000e+00 | United States | New York Stock Exchange | NYA | USD |
2 | NYA | 2007-01-05 | 9113.160156 | 9113.160156 | 9002.330078 | 9025.040039 | 9025.040039 | 2.919400e+09 | United States | New York Stock Exchange | NYA | USD |
3 | NYA | 2007-01-08 | 9025.040039 | 9052.099609 | 8993.620117 | 9041.120117 | 9041.120117 | 2.763340e+09 | United States | New York Stock Exchange | NYA | USD |
4 | NYA | 2007-01-09 | 9041.099609 | 9048.269531 | 8984.200195 | 9023.900391 | 9023.900391 | 3.038380e+09 | United States | New York Stock Exchange | NYA | USD |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9513 | N100 | 2009-12-24 | 679.609985 | 680.590027 | 678.789978 | 679.340027 | 679.340027 | 4.007700e+07 | Europe | Euronext | N100 | EUR |
9514 | N100 | 2009-12-28 | 683.280029 | 685.260010 | 682.710022 | 684.559998 | 684.559998 | 0.000000e+00 | Europe | Euronext | N100 | EUR |
9515 | N100 | 2009-12-29 | 686.109985 | 689.830017 | 685.280029 | 687.890015 | 687.890015 | 1.318605e+08 | Europe | Euronext | N100 | EUR |
9516 | N100 | 2009-12-30 | 686.549988 | 688.530029 | 682.750000 | 683.760010 | 683.760010 | 1.314158e+08 | Europe | Euronext | N100 | EUR |
9517 | N100 | 2009-12-31 | 686.820007 | 687.210022 | 683.760010 | 683.760010 | 683.760010 | 6.105040e+07 | Europe | Euronext | N100 | EUR |
9518 rows × 12 columns
alt.Chart(df_temp).mark_line().encode(
x="Date",
y="Close",
color="Exchange"
)
sel = alt.selection_single(fields=["Exchange"], bind="legend")
alt.Chart(df_temp).mark_line().encode(
x="Date",
y="Close",
color="Exchange"
).add_selection(
sel
)
sel = alt.selection_single(fields=["Exchange"], bind="legend")
alt.Chart(df_temp).mark_line().encode(
x="Date",
y="Close",
color="Exchange",
opacity=alt.condition(sel, alt.value(1), alt.value(0.1))
).add_selection(
sel
)