Week 3 Tuesday Discussion
Contents
Week 3 Tuesday Discussion¶
Quiz 2 Review¶
import seaborn as sns
# Load the classic Titanic passenger dataset (891 rows x 15 columns);
# several columns (age, embarked, deck, embark_town) contain missing values.
df = sns.load_dataset("titanic")
df
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.0000 | S | Second | man | True | NaN | Southampton | no | True |
887 | 1 | 1 | female | 19.0 | 0 | 0 | 30.0000 | S | First | woman | False | B | Southampton | yes | True |
888 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
889 | 1 | 1 | male | 26.0 | 0 | 0 | 30.0000 | C | First | man | True | C | Cherbourg | yes | True |
890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
891 rows × 15 columns
Question 1:
1. Write code to determine which columns of df contain missing values. Do the same to determine which rows of df contain missing values.
2. Write code to determine which rows of df contain at least 2 missing values.
3. Create a sub-DataFrame of df which contains only rows which have at least one missing value.
4. Create a sub-DataFrame of df which contains only rows which have at least one missing value and embark_town listed as Southampton.
Code-Along Solutions¶
# Columns containing at least one missing value: index the column labels
# with the per-column boolean Series produced by df.isna().any().
df.columns[df.isna().any()].tolist()
['age', 'embarked', 'deck', 'embark_town']
# Rows containing at least one missing value.
# NOTE(fix): the original used df.iloc[r] (positional lookup) with LABELS drawn
# from df.index — that only works because this DataFrame happens to have a
# default RangeIndex. Label-based / vectorized access is correct for any index
# and matches the pre-written solution style.
df.index[df.isna().any(axis=1)].tolist()
[0,
2,
4,
5,
7,
8,
9,
12,
13,
14,
15,
16,
17,
18,
19,
20,
22,
24,
25,
26,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
53,
55,
56,
57,
58,
59,
60,
61,
63,
64,
65,
67,
68,
69,
70,
71,
72,
73,
74,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
89,
90,
91,
93,
94,
95,
98,
99,
100,
101,
103,
104,
105,
106,
107,
108,
109,
111,
112,
113,
114,
115,
116,
117,
119,
120,
121,
122,
125,
126,
127,
128,
129,
130,
131,
132,
133,
134,
135,
138,
140,
141,
142,
143,
144,
145,
146,
147,
149,
150,
152,
153,
154,
155,
156,
157,
158,
159,
160,
161,
162,
163,
164,
165,
166,
167,
168,
169,
171,
172,
173,
175,
176,
178,
179,
180,
181,
182,
184,
185,
186,
187,
188,
189,
190,
191,
192,
196,
197,
198,
199,
200,
201,
202,
203,
204,
206,
207,
208,
210,
211,
212,
213,
214,
216,
217,
219,
220,
221,
222,
223,
225,
226,
227,
228,
229,
231,
232,
233,
234,
235,
236,
237,
238,
239,
240,
241,
242,
243,
244,
246,
247,
249,
250,
253,
254,
255,
256,
258,
259,
260,
261,
264,
265,
266,
267,
270,
271,
272,
274,
276,
277,
278,
279,
280,
281,
282,
283,
284,
285,
286,
287,
288,
289,
290,
293,
294,
295,
296,
298,
300,
301,
302,
303,
304,
306,
308,
312,
313,
314,
315,
316,
317,
320,
321,
322,
323,
324,
326,
328,
330,
333,
334,
335,
338,
339,
342,
343,
344,
346,
347,
348,
349,
350,
351,
352,
353,
354,
355,
357,
358,
359,
360,
361,
362,
363,
364,
365,
367,
368,
371,
372,
373,
374,
375,
376,
378,
379,
380,
381,
382,
383,
384,
385,
386,
387,
388,
389,
391,
392,
395,
396,
397,
398,
399,
400,
401,
402,
403,
404,
405,
406,
407,
408,
409,
410,
411,
413,
414,
415,
416,
417,
418,
419,
420,
421,
422,
423,
424,
425,
426,
427,
428,
431,
432,
433,
436,
437,
439,
440,
441,
442,
443,
444,
446,
447,
448,
450,
451,
454,
455,
457,
458,
459,
461,
463,
464,
465,
466,
467,
468,
469,
470,
471,
472,
474,
475,
476,
477,
478,
479,
480,
481,
482,
483,
485,
488,
489,
490,
491,
493,
494,
495,
497,
499,
500,
501,
502,
503,
506,
507,
508,
509,
510,
511,
513,
514,
517,
518,
519,
521,
522,
524,
525,
526,
527,
528,
529,
530,
531,
532,
533,
534,
535,
537,
538,
541,
542,
543,
545,
546,
547,
548,
549,
551,
552,
553,
554,
555,
557,
559,
560,
561,
562,
563,
564,
565,
566,
567,
568,
569,
570,
573,
574,
575,
576,
578,
579,
580,
582,
584,
586,
588,
589,
590,
592,
593,
594,
595,
596,
597,
598,
600,
601,
602,
603,
604,
605,
606,
607,
608,
610,
611,
612,
613,
614,
615,
616,
617,
619,
620,
622,
623,
624,
626,
628,
629,
631,
633,
634,
635,
636,
637,
638,
639,
640,
642,
643,
644,
646,
648,
649,
650,
651,
652,
653,
654,
655,
656,
657,
658,
660,
661,
663,
664,
665,
666,
667,
668,
669,
670,
672,
673,
674,
675,
676,
677,
678,
680,
682,
683,
684,
685,
686,
687,
688,
691,
692,
693,
694,
695,
696,
697,
702,
703,
704,
705,
706,
708,
709,
711,
713,
714,
718,
719,
720,
721,
722,
723,
725,
726,
727,
728,
729,
731,
732,
733,
734,
735,
736,
738,
739,
740,
743,
744,
746,
747,
749,
750,
752,
753,
754,
755,
756,
757,
758,
760,
761,
762,
764,
766,
767,
768,
769,
770,
771,
773,
774,
775,
776,
777,
778,
780,
783,
784,
785,
786,
787,
788,
790,
791,
792,
793,
794,
795,
797,
798,
799,
800,
801,
803,
804,
805,
807,
808,
810,
811,
812,
813,
814,
815,
816,
817,
818,
819,
821,
822,
824,
825,
826,
827,
828,
829,
830,
831,
832,
833,
834,
836,
837,
838,
839,
840,
841,
842,
843,
844,
845,
846,
847,
848,
849,
850,
851,
852,
854,
855,
856,
858,
859,
860,
861,
863,
864,
865,
866,
868,
869,
870,
873,
874,
875,
876,
877,
878,
880,
881,
882,
883,
884,
885,
886,
888,
890]
# Rows containing at least 2 missing values.
# NOTE(fix): the original used df.iloc[r] (positional) with labels from
# df.index — correct only for a default RangeIndex. Counting NaNs per row
# vectorized (axis=1) is both label-safe and avoids a Python-level loop.
df.index[df.isna().sum(axis=1) >= 2].tolist()
[5,
17,
19,
26,
28,
29,
32,
36,
42,
45,
46,
47,
48,
61,
64,
65,
76,
77,
82,
87,
95,
101,
107,
109,
121,
126,
140,
154,
158,
159,
168,
176,
180,
181,
186,
196,
198,
201,
214,
223,
229,
235,
240,
241,
250,
256,
260,
264,
270,
274,
277,
295,
300,
301,
304,
306,
324,
330,
334,
335,
347,
354,
358,
359,
364,
367,
368,
375,
384,
388,
409,
410,
411,
413,
415,
420,
425,
428,
431,
444,
451,
454,
459,
464,
466,
468,
470,
481,
485,
490,
495,
497,
502,
507,
511,
517,
522,
524,
531,
533,
538,
547,
552,
557,
560,
563,
564,
568,
573,
578,
584,
589,
593,
596,
598,
601,
602,
611,
612,
613,
629,
633,
639,
643,
648,
650,
653,
656,
667,
674,
680,
692,
697,
709,
718,
727,
732,
738,
739,
760,
766,
768,
773,
778,
783,
790,
792,
793,
825,
826,
828,
829,
832,
837,
846,
859,
863,
868,
878,
888]
# Sub-DataFrame of rows with at least one missing value, selected with a
# single boolean mask instead of building an explicit label list.
df[df.isna().any(axis=1)]
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
5 | 0 | 3 | male | NaN | 0 | 0 | 8.4583 | Q | Third | man | True | NaN | Queenstown | no | True |
7 | 0 | 3 | male | 2.0 | 3 | 1 | 21.0750 | S | Third | child | False | NaN | Southampton | no | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
884 | 0 | 3 | male | 25.0 | 0 | 0 | 7.0500 | S | Third | man | True | NaN | Southampton | no | True |
885 | 0 | 3 | female | 39.0 | 0 | 5 | 29.1250 | Q | Third | woman | False | NaN | Queenstown | no | False |
886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.0000 | S | Second | man | True | NaN | Southampton | no | True |
888 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
709 rows × 15 columns
Pre-Written Solutions¶
# Missing columns: column labels whose Series contains any NaN.
df.columns[df.isna().any()].tolist()
['age', 'embarked', 'deck', 'embark_town']
# Missing rows: index labels of rows containing at least one NaN,
# computed row-wise (axis=1) in one vectorized pass.
df.index[df.isna().any(axis=1)].tolist()
[0,
2,
4,
5,
7,
8,
9,
12,
13,
14,
15,
16,
17,
18,
19,
20,
22,
24,
25,
26,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
53,
55,
56,
57,
58,
59,
60,
61,
63,
64,
65,
67,
68,
69,
70,
71,
72,
73,
74,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
89,
90,
91,
93,
94,
95,
98,
99,
100,
101,
103,
104,
105,
106,
107,
108,
109,
111,
112,
113,
114,
115,
116,
117,
119,
120,
121,
122,
125,
126,
127,
128,
129,
130,
131,
132,
133,
134,
135,
138,
140,
141,
142,
143,
144,
145,
146,
147,
149,
150,
152,
153,
154,
155,
156,
157,
158,
159,
160,
161,
162,
163,
164,
165,
166,
167,
168,
169,
171,
172,
173,
175,
176,
178,
179,
180,
181,
182,
184,
185,
186,
187,
188,
189,
190,
191,
192,
196,
197,
198,
199,
200,
201,
202,
203,
204,
206,
207,
208,
210,
211,
212,
213,
214,
216,
217,
219,
220,
221,
222,
223,
225,
226,
227,
228,
229,
231,
232,
233,
234,
235,
236,
237,
238,
239,
240,
241,
242,
243,
244,
246,
247,
249,
250,
253,
254,
255,
256,
258,
259,
260,
261,
264,
265,
266,
267,
270,
271,
272,
274,
276,
277,
278,
279,
280,
281,
282,
283,
284,
285,
286,
287,
288,
289,
290,
293,
294,
295,
296,
298,
300,
301,
302,
303,
304,
306,
308,
312,
313,
314,
315,
316,
317,
320,
321,
322,
323,
324,
326,
328,
330,
333,
334,
335,
338,
339,
342,
343,
344,
346,
347,
348,
349,
350,
351,
352,
353,
354,
355,
357,
358,
359,
360,
361,
362,
363,
364,
365,
367,
368,
371,
372,
373,
374,
375,
376,
378,
379,
380,
381,
382,
383,
384,
385,
386,
387,
388,
389,
391,
392,
395,
396,
397,
398,
399,
400,
401,
402,
403,
404,
405,
406,
407,
408,
409,
410,
411,
413,
414,
415,
416,
417,
418,
419,
420,
421,
422,
423,
424,
425,
426,
427,
428,
431,
432,
433,
436,
437,
439,
440,
441,
442,
443,
444,
446,
447,
448,
450,
451,
454,
455,
457,
458,
459,
461,
463,
464,
465,
466,
467,
468,
469,
470,
471,
472,
474,
475,
476,
477,
478,
479,
480,
481,
482,
483,
485,
488,
489,
490,
491,
493,
494,
495,
497,
499,
500,
501,
502,
503,
506,
507,
508,
509,
510,
511,
513,
514,
517,
518,
519,
521,
522,
524,
525,
526,
527,
528,
529,
530,
531,
532,
533,
534,
535,
537,
538,
541,
542,
543,
545,
546,
547,
548,
549,
551,
552,
553,
554,
555,
557,
559,
560,
561,
562,
563,
564,
565,
566,
567,
568,
569,
570,
573,
574,
575,
576,
578,
579,
580,
582,
584,
586,
588,
589,
590,
592,
593,
594,
595,
596,
597,
598,
600,
601,
602,
603,
604,
605,
606,
607,
608,
610,
611,
612,
613,
614,
615,
616,
617,
619,
620,
622,
623,
624,
626,
628,
629,
631,
633,
634,
635,
636,
637,
638,
639,
640,
642,
643,
644,
646,
648,
649,
650,
651,
652,
653,
654,
655,
656,
657,
658,
660,
661,
663,
664,
665,
666,
667,
668,
669,
670,
672,
673,
674,
675,
676,
677,
678,
680,
682,
683,
684,
685,
686,
687,
688,
691,
692,
693,
694,
695,
696,
697,
702,
703,
704,
705,
706,
708,
709,
711,
713,
714,
718,
719,
720,
721,
722,
723,
725,
726,
727,
728,
729,
731,
732,
733,
734,
735,
736,
738,
739,
740,
743,
744,
746,
747,
749,
750,
752,
753,
754,
755,
756,
757,
758,
760,
761,
762,
764,
766,
767,
768,
769,
770,
771,
773,
774,
775,
776,
777,
778,
780,
783,
784,
785,
786,
787,
788,
790,
791,
792,
793,
794,
795,
797,
798,
799,
800,
801,
803,
804,
805,
807,
808,
810,
811,
812,
813,
814,
815,
816,
817,
818,
819,
821,
822,
824,
825,
826,
827,
828,
829,
830,
831,
832,
833,
834,
836,
837,
838,
839,
840,
841,
842,
843,
844,
845,
846,
847,
848,
849,
850,
851,
852,
854,
855,
856,
858,
859,
860,
861,
863,
864,
865,
866,
868,
869,
870,
873,
874,
875,
876,
877,
878,
880,
881,
882,
883,
884,
885,
886,
888,
890]
# At least 2 missing per row: count NaNs row-wise once, then reuse the
# counts both to pick the row labels and to display them.
nan_counts = df.isna().sum(axis=1)
more_1 = nan_counts.index[nan_counts > 1].tolist()
nan_counts[nan_counts > 1]
5 2
17 2
19 2
26 2
28 2
..
859 2
863 2
868 2
878 2
888 2
Length: 160, dtype: int64
# Missing sub-DataFrame: boolean row mask in place of an explicit label list.
df[df.isna().any(axis=1)]
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
5 | 0 | 3 | male | NaN | 0 | 0 | 8.4583 | Q | Third | man | True | NaN | Queenstown | no | True |
7 | 0 | 3 | male | 2.0 | 3 | 1 | 21.0750 | S | Third | child | False | NaN | Southampton | no | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
884 | 0 | 3 | male | 25.0 | 0 | 0 | 7.0500 | S | Third | man | True | NaN | Southampton | no | True |
885 | 0 | 3 | female | 39.0 | 0 | 5 | 29.1250 | Q | Third | woman | False | NaN | Queenstown | no | False |
886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.0000 | S | Second | man | True | NaN | Southampton | no | True |
888 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
709 rows × 15 columns
# Missing + embark_town: combine the two row conditions with a single
# element-wise boolean mask (& requires parentheses around the comparison).
df[df.isna().any(axis=1) & (df["embark_town"] == "Southampton")]
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
7 | 0 | 3 | male | 2.0 | 3 | 1 | 21.0750 | S | Third | child | False | NaN | Southampton | no | False |
8 | 1 | 3 | female | 27.0 | 0 | 2 | 11.1333 | S | Third | woman | False | NaN | Southampton | yes | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
882 | 0 | 3 | female | 22.0 | 0 | 0 | 10.5167 | S | Third | woman | False | NaN | Southampton | no | True |
883 | 0 | 2 | male | 28.0 | 0 | 0 | 10.5000 | S | Second | man | True | NaN | Southampton | no | True |
884 | 0 | 3 | male | 25.0 | 0 | 0 | 7.0500 | S | Third | man | True | NaN | Southampton | no | True |
886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.0000 | S | Second | man | True | NaN | Southampton | no | True |
888 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
529 rows × 15 columns
Question 2:
Consider the following Altair chart. Explain how you could use it to determine where parch is the largest and how many different unique embark_town values there are.
import altair as alt
# Scatter plot of passengers under age 20: age on x, fare on y,
# point color encodes embark_town and point size encodes parch
# (both legends suppressed) — so the largest circle answers "where is
# parch largest", and the number of distinct colors answers "how many
# unique embark_town values".
alt.Chart(df[df["age"] < 20]).mark_circle().encode(
    x = alt.X("age"),
    y = alt.Y("fare"),
    color = alt.Color("embark_town", legend = None),
    size = alt.Size("parch", legend = None)
)