# - Data manipulation -
import numpy as np                                     # numerical python
import pandas as pd                                    # python based analysis for data scientists

import missingno as missing                            # missing data explorer
from sklearn.preprocessing import MinMaxScaler         # data scaler
from scipy.interpolate import interp1d                 # interpolator

# - ML models -
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr                       # lin. corr. strength

# - Visualization -
import matplotlib.pyplot as plt                        # plotter
import seaborn as sb                                   # advanced plotter

# - Interactive plots (offline mode) -
import plotly                                          
import plotly.graph_objects as go
import plotly.offline as ply
import cufflinks as cf

# - Local libraries -
import sys
import copy                                           # for deepcopy

# - Misc extension -
from IPython.display import HTML, display
import re


# --Global parameters and settings--
# Notebook-wide configuration. The `%...` lines are IPython magics, so this
# cell only runs inside an IPython/Jupyter session.

# pandas: cap how many columns/rows are rendered when a frame is displayed
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# matplotlib and seaborn: render figures inline, use seaborn's theme and
# the "pastel" palette as the default for later plots
%matplotlib inline
sb.set_theme(color_codes= True)
sb.set_palette("pastel")

# plotly and cufflinks: switch both to offline notebook mode, white theme
plotly.offline.init_notebook_mode(connected=True)
cf.go_offline(connected=True)
cf.set_config_file(theme='white')

# misc: load the autoreload extension only once per session, then enable
# mode 2
if 'autoreload' not in get_ipython().extension_manager.loaded:  
    %load_ext autoreload
%autoreload 2;

# parameters: running counters used to number the "Table N" / "Figure N"
# headings; each cell that prints such a heading increments its counter
table_count = 1
figure_count = 1


# OBSERVATION DATA:
# load data (expects 'observations.csv' relative to the working directory)
observ_data = pd.read_csv('observations.csv')

############################################################################

# OBSERVATION data processing:

# check for missing or duplicates
# missing.matrix(observ_data, figsize=(30,5))
# result: no missing or NaN labeled data

# check labels
# result: some labels are too long
# action: drop ' National Park' from park names
def drop_NP(entry):
    """Return *entry* with every literal ' National Park' occurrence removed.

    The text to remove contains no pattern syntax, so a plain string
    replacement is used instead of the regex engine.
    """
    return entry.replace(' National Park', '')


observ_data['park_name'] = observ_data['park_name'].apply(drop_NP)

# check data types
# display(observ_data.dtypes)
# result: some categorical data has type object
# action: categorical data gets proper data type
for column in ('park_name', 'scientific_name'):
    observ_data[column] = observ_data[column].astype('category')

############################################################################

# OBSERVATION DATA INFO:

# OBSERVATION data categories: column names, upper-cased for the heading
display(HTML('<h3>Observation Data Categories: %s</h3>' % ', '.join(observ_data.columns).upper()))

# OBSERVATION final data types: one row per column, indexed by column name.
# No reset_index() call here: its result was never assigned, so it had no
# effect on the displayed table.
observ_types = observ_data.dtypes.to_frame('data type')
display(HTML('<h3>Table %d: Observation Data Types</h3>' % table_count))
table_count += 1
display(observ_types)

# OBSERVATION data content:
display(HTML('<h3>Table %d: Observation Data Excerpt</h3>' % table_count))
table_count += 1
display(observ_data)

# OBSERVATION data length: each row is one (species, park) observation
# record, so the count is labelled as records rather than species.
display(HTML('<h3>Observation Data Length: %d records</h3>' % len(observ_data)))


# SPECIES DATA:
# load data (expects 'species_info.csv' relative to the working directory)
species_data = pd.read_csv('species_info.csv')

############################################################################

# SPECIES DATA PROCESSING:

# check for missing data
# missing.matrix(species_data, figsize=(30,5))
# result: 'conservation_status' has NaN labels.
# action: rename NaN to 'Unknown'
species_data['conservation_status'] = species_data['conservation_status'].fillna('Unknown')

# check for duplicated data
num_species_duplicates = species_data.scientific_name.duplicated().sum()
# result: duplicated 'scientific_name' due to multiple 'common_name' entries
# action: drop duplicated 'scientific_name' entries (first occurrence is kept)
species_data = species_data.drop_duplicates(subset='scientific_name', keep="first")
#display(HTML('<p>Note: Due to multiple "common name" entries, the data had %s duplicate "scientific name" entries. The table was cleaned. Furthermore, the "type" column was added.</p>' % num_species_duplicates))

# check data types:
# species_data.dtypes
# result: some categorical data has type object
# action: categorical data gets proper data type
category_cat = pd.Categorical(
    species_data.category,
    categories=['Reptile', 'Amphibian', 'Fish', 'Mammal', 'Bird', 'Nonvascular Plant', 'Vascular Plant'],
    ordered=False,
)
species_data['category'] = category_cat
species_data['scientific_name'] = species_data['scientific_name'].astype('category')
# 'conservation' is an ordered categorical, so its codes rise in the order
# the categories are listed here.
conserv_cat = pd.Categorical(
    species_data.conservation_status,
    categories=['Unknown', 'Species of Concern', 'In Recovery', 'Threatened', 'Endangered'],
    ordered=True,
)
species_data['conservation'] = conserv_cat

# introduction of species types (animal, plant): will come in handy later
PLANTS = ['Vascular Plant', 'Nonvascular Plant']
ANIMALS = ['Amphibian', 'Bird', 'Fish', 'Mammal', 'Reptile']


def type_sort(row):
    """Return 'Animal' if row['category'] is listed in ANIMALS, else 'Plant'."""
    return 'Animal' if row['category'] in ANIMALS else 'Plant'


# Vectorised equivalent of applying type_sort to every row: anything that is
# not an animal category (including a missing category) is labelled 'Plant'.
is_animal = species_data['category'].isin(ANIMALS)
species_data['type'] = pd.Series(
    np.where(is_animal, 'Animal', 'Plant'), index=species_data.index
).astype('category')
species_data = species_data[['type', 'category', 'scientific_name', 'common_names', 'conservation']]

############################################################################

# SPECIES DATA INFO:

# SPECIES data categories: column names, upper-cased for the heading
display(HTML('<h3>Species Data Categories: %s</h3>' % ', '.join(species_data.columns).upper()))

# SPECIES final data types: one row per column, indexed by column name.
# No reset_index() call here: its result was never assigned, so it had no
# effect on the displayed table.
species_datatypes = species_data.dtypes.to_frame('data type')
display(HTML('<h3>Table %d: Species Data Types</h3>' % table_count))
table_count += 1
display(species_datatypes)

# SPECIES data content
display(HTML('<h3>Table %d: Species Data Excerpt</h3>' % table_count))
table_count += 1
display(species_data)

# SPECIES data length: count the rows of the species table itself
# (not the observation table).
display(HTML('<h3>Species Data Length: %d species</h3>' % len(species_data)))


# MERGED DATA:
# Total observations per species over all parks. Only the 'observations'
# column is summed: summing the whole frame would also try to aggregate the
# categorical 'park_name' column, which recent pandas versions reject
# instead of silently dropping. observed=True keeps one row per species that
# actually occurs in the data.
observ_data_total = (
    observ_data.groupby('scientific_name', observed=True)['observations']
    .sum()
    .reset_index()
)
# merge: keep every species row, attach its summed observations
merged_data = pd.merge(
    species_data.drop(columns=['common_names']),
    observ_data_total,
    how='left',
    on=['scientific_name'],
)
display(HTML('<h3>Table %d: Merged tables with summed observations</h3>' % table_count))
table_count += 1
display(merged_data)

# Merged data encoded: same rows, categorical columns replaced by their codes
data_catcode = merged_data.copy()
encoded_columns = ['type', 'category', 'conservation']
# create encoding map: {column: {label: code}}, built before the labels are
# overwritten by the codes
encode_map = {
    column: dict(zip(data_catcode[column].values, data_catcode[column].cat.codes))
    for column in encoded_columns
}
# encode data:
for column in encoded_columns:
    data_catcode[column] = data_catcode[column].cat.codes
# index and column sort:
data_catcode = data_catcode.set_index('scientific_name')
data_catcode = data_catcode[['type', 'category', 'conservation', 'observations']]

display(HTML('<h3>Table %d: Merged tables with summed observations (encoded)</h3>' % table_count))
table_count += 1
display(data_catcode)

display(HTML('<h3>Categorical Encoding Map:</h3><p>%s</p>' %encode_map))


# BIODIVERSITY
# Distinct species types and categories present in the merged table.
species_types = merged_data.type.unique()
display(HTML('<h3>Species Types: %s</h3>' % ', '.join(species_types)))
species_categories = merged_data.category.unique()
display(HTML('<h3>Species Categories: %s</h3>' % ', '.join(species_categories)))

# species per category: number of non-null 'type' entries in each category,
# largest first. Grouping is always along rows, so the deprecated `axis`
# argument is not passed.
species_count = (
    merged_data.groupby('category')['type']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
species_count.columns = ['category', 'species_count']
display(HTML('<h3>Table %d: Number of Species per Category</h3>' % table_count))
table_count += 1
display(species_count)


# Sum and display species types and categories:
# Each table holds the summed 'observations' per group. Only that column is
# aggregated (the other columns are categorical/text and cannot be summed),
# and observed=True limits the result to groups that actually occur, so no
# positional dropping of empty category rows is needed afterwards.

# -- TYPES --
animal_plant_data = (
    merged_data.groupby('type', observed=True)['observations'].sum().reset_index()
)
species_colors = ["#ca684d", "#7ba24f"]
#display(animal_plant_data)

# -- ANIMALS --
animal_data = (
    merged_data[merged_data.type == 'Animal']
    .groupby('category', observed=True)['observations']
    .sum()
    .reset_index()
)
animal_colors = ["#e6b8b3", "#9bcde5", "#d6d8b4", "#cdbedd", "#aad2bf"]
#display(animal_data)

# -- PLANTS --
plant_data = (
    merged_data[merged_data.type == 'Plant']
    .groupby('category', observed=True)['observations']
    .sum()
    .reset_index()
)
plant_colors = ["#d7da5b", "#81e799"]
#display(plant_data)


# Three pies side by side: share of observations by type, by plant category
# and by animal category.
display(HTML('<h3>Figure %d: Biodiversity per Category</h3>' % figure_count))
figure_count += 1
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

ax1.pie(
    x=animal_plant_data.observations, autopct="%.0f%%", pctdistance=0.7,
    explode=[0.05] * len(animal_plant_data), startangle=0,
    colors=species_colors, labels=animal_plant_data.type,
)
ax1.axis('equal')
ax1.set_title('Species Categories')

ax2.pie(
    x=plant_data.observations, autopct="%.0f%%", pctdistance=0.7,
    explode=[0.05] * len(plant_data), startangle=100,
    colors=plant_colors, labels=plant_data.category,
)
ax2.axis('equal')
ax2.set_title('Plant Categories')

ax3.pie(
    x=animal_data.observations, autopct="%.0f%%", pctdistance=0.7,
    explode=[0.05] * len(animal_data), startangle=-30,
    colors=animal_colors, labels=animal_data.category,
)
ax3.axis('equal')
ax3.set_title('Animal Categories')

plt.show()
# Close rather than clear: clearing after show() would create a new, empty
# current figure, which the notebook then reports as a figure with 0 axes.
plt.close(fig)

<Figure size 432x288 with 0 Axes>


# BIODIVERSITY PER NATIONAL PARK
NATIONAL_PARKS = observ_data.park_name.unique()
display(HTML('<h3>National Parks: %s</h3>' % ', '.join(NATIONAL_PARKS)))

# Counts per park pivoted: one row per species, one column per park, plus a
# 'Total Count' margin row/column. The aggregation is named by string so the
# column level is 'sum' and pandas' own sum implementation is used.
observ_park = observ_data.pivot_table(
    index='scientific_name', columns='park_name', values='observations',
    aggfunc=['sum'], margins=True, margins_name='Total Count',
)
display(HTML('<h3>Table %d: Observed Spiecies per National Park</h3>' % table_count))
table_count += 1
display(observ_park)

# Number the figure from the running counter instead of a fixed "1", so it
# does not repeat the number of the previous figure.
display(HTML('<h3>Figure %d: Observed Individuals per National Park</h3>' % figure_count))
figure_count += 1
# The last pivot row is the 'Total Count' margin; .iloc[-1] reads it by
# position explicitly. The frame is built column-wise from a dict so the
# 'key'/'value' layout does not depend on how a list of rows is interpreted.
temp_frame = pd.DataFrame({
    'key': NATIONAL_PARKS,
    'value': [observ_park[('sum', park)].iloc[-1] for park in NATIONAL_PARKS],
})
#display(temp_frame)
sb.barplot(data=temp_frame, x='value', y='key')
plt.xlabel('Individual Counts (millions)')
plt.ylabel('National Park')
plt.title('Flora and Founa Count per National Park')
plt.show()
# Close rather than clear: clearing after show() would create a new, empty
# current figure, which the notebook then reports as a figure with 0 axes.
plt.close()

<Figure size 432x288 with 0 Axes>


# Join the per-park species counts onto the species categories.
# The last row of the pivot is the 'Total Count' margin, so it is left out.
species_rows = observ_park.iloc[:-1]
temp_dict = dict(scientific_name=species_rows.index.values)
for park in NATIONAL_PARKS:
    temp_dict[park] = species_rows[('sum', park)].values
counts_per_park = pd.DataFrame(data=temp_dict)
species_category_only = species_data.drop(columns=['type', 'common_names', 'conservation'])
merged_counts_parks = species_category_only.merge(counts_per_park, how='left', on=['scientific_name'])
#display(merged_counts_parks)

# Plot styling for the park comparison: one pastel colour per park, and a
# lookup from a string index to the park name used by the plotting loop.
park_palette = sb.color_palette("pastel", 4)
park_colors = {park_name: color for park_name, color in zip(NATIONAL_PARKS, park_palette)}
park_dict = {'1':'Bryce', '0':'Great Smoky Mountains', '2':'Yosemite', '3':'Yellowstone'}

def return_zero(x):
    """Return the constant 0, ignoring *x*.

    Has the same call shape as an interpolation function, so it can be
    called with the same argument and always yields the scalar 0.
    """
    baseline = 0
    return baseline

# Per-park distribution plots for the categories listed in selected_types.
# For each category: build a table of per-park counts keyed by an integer
# species id, then draw one cubic-interpolated curve per park.
selected_types = ['Reptile', 'Amphibian', 'Mammal', 'Bird']
type_counts_park = dict()
type_counts_interpol = dict()
for species_type in selected_types:
    display(HTML('<h3>Figure {figure_number}: {type_string} Species Distribution per Park:</h3>'.format(figure_number=figure_count, type_string=species_type)))
    figure_count += 1
    # Tables: rows of this category, species name replaced by its category
    # code ('name_id') so it can serve as a numeric x value.
    park_counts = merged_counts_parks[merged_counts_parks.category == species_type].reset_index(drop=True)
    park_counts['scientific_name'] = park_counts['scientific_name'].astype('category')
    park_counts.insert(loc=1, column='name_id', value=park_counts['scientific_name'].cat.codes)
    park_counts = park_counts.drop(columns=['scientific_name'])
    type_counts_park[species_type] = park_counts
    # Figures: evaluate the curves on a grid three times as dense as the ids.
    x_length = len(park_counts.name_id)
    x_new = np.linspace(0, x_length - 1, num=x_length * 3, endpoint=True)
    # curves[0] is the constant-zero baseline; curves[i + 1] belongs to park i.
    curves = [return_zero]
    type_counts_interpol[species_type] = curves
    plt.figure(figsize=(20, 4))
    plt.title('%s Species Distribution per National Park' % species_type)
    plt.xlabel('Species (Encoded)')
    plt.ylabel('Species Count')
    for i in range(4):
        park_label = park_dict[str(i)]
        curves.append(interp1d(park_counts.name_id, park_counts[park_label], kind='cubic'))
        plt.plot(x_new, curves[i + 1](x_new), '-', color=park_colors[park_label], label=park_label)
        # shade the band between this park's curve and the previous one
        plt.fill_between(x_new, curves[i](x_new), curves[i + 1](x_new), color=park_colors[park_label])
    # one legend per figure, once all four curves are labelled
    plt.legend()
    plt.show()
    # Close rather than clear: clearing after show() would create a new,
    # empty current figure, reported by the notebook as a figure with 0 axes.
    plt.close()

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>


# CONSERVATION STATES:
species_states = species_data[['category', 'scientific_name', 'conservation']].copy()
# Threat level = integer code of the ordered 'conservation' categorical,
# taken from this frame's own column so it cannot drift out of row alignment
# with another table.
species_states['threat_level'] = species_states['conservation'].cat.codes

conservation_labels = species_states.conservation.unique()
display(HTML('<h3>Species Conservation Categories: %s</h3>' % ', '.join(conservation_labels)))

# conservation status counts:
# observed=True restricts the groups to (conservation, threat_level) pairs
# that occur in the data. Without it, grouping on a categorical key expands
# to every combination of the two keys, leaving empty rows to filter out.
conservation_counts = (
    species_states.groupby(['conservation', 'threat_level'], observed=True)
    .count()
    .sort_values(by='conservation', ascending=False)
    .reset_index()
    .drop(columns=['scientific_name'])
    .dropna()
)
conservation_counts.columns = ['conservation', 'threat_level', 'species count']
display(HTML('<h3>Table %d: Species Conservation State</h3>' % table_count))
table_count += 1
display(conservation_counts)
display(HTML('<p>Note: Here, the threat level" respectively, is the orderly encode of "conservation". The order is based on the "IUCN Red List of Threatened Species".</p>'))


# pivoted table: threat level per (category, species), one column per
# conservation label, with a 'Threat Level' margin. The aggregation is named
# by string ('max') so pandas uses its own implementation and the column
# level stays 'max'.
species_pivot_table = species_states.pivot_table(
    index=['category', 'scientific_name'], columns='conservation',
    values='threat_level', aggfunc=['max'],
    margins=True, margins_name='Threat Level',
)
display(HTML('<h3>Table %d: Spiecies Conservation states overview</h3>' % table_count))
table_count += 1
display(species_pivot_table)


# distill observ over conserv data: category and observations from the
# merged table, plus the encoded conservation status taken row-for-row from
# data_catcode.
observ_conserv_data = merged_data[['category', 'observations']].copy()
observ_conserv_data['conservation'] = data_catcode.conservation.values

display(HTML('<h3>Figure %d: Species Observations per Category over Conservation Status</h3>' % figure_count))
figure_count += 1
# One regression panel per category; points are the mean observations at
# each conservation code.
ax = sb.lmplot(data=observ_conserv_data, x='conservation', y='observations', col='category', col_wrap=3, x_estimator=np.mean)
plt.xticks(list(range(5)))
plt.xlim(-0.2, 4.2)
ax.set_xticklabels(['Unknown', 'Concern', 'Recov.', 'Threat.', 'Endang.'])
plt.show()
# Close rather than clear: clearing after show() would create a new, empty
# current figure, which the notebook then reports as a figure with 0 axes.
plt.close()

<Figure size 432x288 with 0 Axes>


# Category to conservation correlation:
# Average the encoded columns per distinct observation count, then correlate
# the resulting columns (observation count included) pairwise.
correlation_heading = '<h3>Table %d: Correlations</h3>' % table_count
display(HTML(correlation_heading))
table_count += 1
mean_per_observation = data_catcode.groupby('observations').mean()
correlation_frame = mean_per_observation.reset_index().corr()
display(correlation_frame)


# conservation medians: median of every encoded column per conservation code.
# Grouping by the column label (not by the column's Series object) makes
# pandas treat 'conservation' as the key unambiguously, so it is excluded
# from the aggregated columns and reset_index() can restore it.
median_frame = data_catcode.groupby('conservation').median().reset_index()
median_frame = median_frame[['type', 'category', 'conservation', 'observations']]
#display(median_frame)


def return_key(key_code, key_chain):
    """Return the first key of *key_chain* whose value equals *key_code*.

    Parameters:
        key_code: value to look for (compared with ``==``).
        key_chain: mapping to search, in its iteration order.

    Returns:
        The matching key, or None when no value equals *key_code*.
    """
    for key, code in key_chain.items():
        if code == key_code:
            return key
    return None
# Translate the encoded median columns back to their labels. Each column in
# encode_map is looked up row by row; a median that matches no code (for
# example a fractional one) yields None from return_key.
median_frame_unencode = median_frame.copy(deep=True)
for entity in encode_map:
    def entity_lambda(row, column=entity, label_codes=encode_map[entity]):
        return return_key(row[column], label_codes)
    median_frame_unencode[entity] = median_frame_unencode.apply(entity_lambda, axis=1)

unencoded_heading = '<h3>Table %d: Medians (unencoded)</h3>' % table_count
display(HTML(unencoded_heading))
table_count += 1
display(median_frame_unencode)

# Bar chart of the median (decoded) category label against each encoded
# conservation level.
display(HTML('<h3>Figure %d: Spiecies Category over Threat Level</h3>' % figure_count))
figure_count += 1
# temporary palette for this figure; the default is restored at the end
sb.set_palette('coolwarm')
ax = sb.barplot(x=median_frame.conservation, y=median_frame_unencode.category, ci=None)
plt.xticks(list(range(5)))
plt.xlim(-0.2, 4.2)
ax.set_xticklabels(['Unknown', 'Concern', 'Recov.', 'Threat.', 'Endang.'])
ax.set_title('Spiecies Category Threat Level')
plt.show()
# Close rather than clear: clearing after show() would create a new, empty
# current figure, which the notebook then reports as a figure with 0 axes.
plt.close()
sb.set_palette('pastel')

<Figure size 432x288 with 0 Axes>


# Share of species per category whose conservation status is 'Endangered'
# or 'Threatened', in percent rounded to one decimal, highest share first.
ratio_dict = dict(category=[], threat_percent=[])
at_risk_labels = ['Endangered', 'Threatened']
for species_cat in species_categories:
    temp_data = merged_data.loc[merged_data.category == species_cat]
    total_count = temp_data.scientific_name.count()
    at_risk_rows = temp_data.loc[temp_data.conservation.isin(at_risk_labels)]
    endangered_count = at_risk_rows.scientific_name.count()
    threat_percent = round(endangered_count * 100.0 / total_count, 1)
    ratio_dict['category'].append(species_cat)
    ratio_dict['threat_percent'].append(threat_percent)
ratio_data = pd.DataFrame(ratio_dict).sort_values(by='threat_percent', ascending=False)
display(ratio_data)


# Species count frequencies: per-species observation totals, split into
# animals and plants, each re-indexed from 0.
animal_freq = merged_data.loc[merged_data.type == 'Animal', ['scientific_name', 'observations']].reset_index(drop=True)
plant_freq = merged_data.loc[merged_data.type == 'Plant', ['scientific_name', 'observations']].reset_index(drop=True)

display(HTML('<h3>Figure %d: Species Count Frequencies</h3>' % figure_count))
figure_count += 1
plt.figure(figsize=(20, 8))
# overlaid histograms with density curves; plants are drawn first
sb.histplot(data=plant_freq, x='observations', kde=True, label='Plants', color=species_colors[1])
sb.histplot(data=animal_freq, x='observations', kde=True, label='Animals', color=species_colors[0])
plt.xlim(0, 1400)
plt.xlabel('Species Observations')
plt.ylabel('Frequency')
plt.title('Species Ovservations Frequency')
plt.legend()
plt.show()
# Close rather than clear: clearing after show() would create a new, empty
# current figure, which the notebook then reports as a figure with 0 axes.
plt.close()

<Figure size 432x288 with 0 Axes>

	scientific_name	park_name	observations
0	Vicia benghalensis	Great Smoky Mountains	68
1	Neovison vison	Great Smoky Mountains	77
2	Prunus subcordata	Yosemite	138
3	Abutilon theophrasti	Bryce	84
4	Githopsis specularioides	Great Smoky Mountains	85
...	...	...	...
23291	Croton monanthogynus	Yosemite	173
23292	Otospermophilus beecheyi	Bryce	130
23293	Heterotheca sessiliflora ssp. echioides	Bryce	140
23294	Dicranella rufescens	Yosemite	171
23295	Cucurbita pepo	Yosemite	164

	type	category	conservation	observations
scientific_name
Clethrionomys gapperi gapperi	0	3	0	615
Bos bison	0	3	0	542
Bos taurus	0	3	0	514
Ovis aries	0	3	0	542
Cervus elaphus	0	3	0	1218
...	...	...	...	...
Solanum parishii	1	6	0	574
Solanum xanti	1	6	0	575
Parthenocissus vitacea	1	6	0	583
Vitis californica	1	6	0	562
Tribulus terrestris	1	6	0	556

	sum
park_name	Bryce	Great Smoky Mountains	Yellowstone	Yosemite	Total Count
scientific_name
Abies bifolia	109	72	215	136	532
Abies concolor	83	101	241	205	630
Abies fraseri	109	81	218	110	518
Abietinella abietina	101	65	243	183	592
Abronia ammophila	92	72	222	137	523
...	...	...	...	...	...
Zonotrichia leucophrys oriantha	73	123	227	135	558
Zonotrichia querula	105	83	268	160	616
Zygodon viridissimus	100	71	270	159	600
Zygodon viridissimus var. rupestris	102	102	237	210	651
Total Count	576025	431820	1443562	863332	3314739

		max
	conservation	Unknown	Species of Concern	In Recovery	Threatened	Endangered	Threat Level
category	scientific_name
Reptile	Agkistrodon contortrix mokasen	0.0	NaN	NaN	NaN	NaN	0
	Anolis carolinensis carolinensis	0.0	NaN	NaN	NaN	NaN	0
	Apalone spinifera spinifera	0.0	NaN	NaN	NaN	NaN	0
	Aspidoscelis tigris munda	0.0	NaN	NaN	NaN	NaN	0
	Carphophis	0.0	NaN	NaN	NaN	NaN	0
...	...	...	...	...	...	...	...
Vascular Plant	Zigadenus venenosus var. venenosus	0.0	NaN	NaN	NaN	NaN	0
	Zizia aptera	0.0	NaN	NaN	NaN	NaN	0
	Zizia aurea	0.0	NaN	NaN	NaN	NaN	0
	Zizia trifoliata	NaN	1.0	NaN	NaN	NaN	1
Threat Level		0.0	1.0	2.0	3.0	4.0	4

	observations	type	category	conservation
observations	1.000000	0.081908	0.156756	-0.415458
type	0.081908	1.000000	0.922266	-0.507811
category	0.156756	0.922266	1.000000	-0.550609
conservation	-0.415458	-0.507811	-0.550609	1.000000

	type	category	scientific_name	common_names	conservation
0	Animal	Mammal	Clethrionomys gapperi gapperi	Gapper's Red-Backed Vole	Unknown
1	Animal	Mammal	Bos bison	American Bison, Bison	Unknown
2	Animal	Mammal	Bos taurus	Aurochs, Aurochs, Domestic Cattle (Feral), Dom...	Unknown
3	Animal	Mammal	Ovis aries	Domestic Sheep, Mouflon, Red Sheep, Sheep (Feral)	Unknown
4	Animal	Mammal	Cervus elaphus	Wapiti Or Elk	Unknown
...	...	...	...	...	...
5819	Plant	Vascular Plant	Solanum parishii	Parish's Nightshade	Unknown
5820	Plant	Vascular Plant	Solanum xanti	Chaparral Nightshade, Purple Nightshade	Unknown
5821	Plant	Vascular Plant	Parthenocissus vitacea	Thicket Creeper, Virginia Creeper, Woodbine	Unknown
5822	Plant	Vascular Plant	Vitis californica	California Grape, California Wild Grape	Unknown
5823	Plant	Vascular Plant	Tribulus terrestris	Bullhead, Caltrop, Goathead, Mexican Sandbur, ...	Unknown

	category	species_count
0	Vascular Plant	4262
1	Bird	488
2	Nonvascular Plant	333
3	Mammal	176
4	Fish	125
5	Amphibian	79
6	Reptile	78

	conservation	threat_level	species count
0	Endangered	4	15.0
6	Threatened	3	9.0
12	In Recovery	2	3.0
18	Species of Concern	1	151.0
24	Unknown	0	5363.0

Biodiversity in National Parks¶

Intro

Methodology

Summary & Conclusions

Suggested Proceeding

Setup¶

Library imports¶

Settings and Parameters¶

Data info and processing¶

Observation data from NPS¶

Observation Data Categories: SCIENTIFIC_NAME, PARK_NAME, OBSERVATIONS

Table 1: Observation Data Types

Table 2: Observation Data Excerpt

Observation Data Length: 23296 species

Species Info Data from NPS¶

Species Data Categories: TYPE, CATEGORY, SCIENTIFIC_NAME, COMMON_NAMES, CONSERVATION

Table 3: Species Data Types

Table 4: Species Data Excerpt

Species Data Length: 23296 species

Merged data¶

Table 5: Merged tables with summed observations

Table 6: Merged tables with summed observations (encoded)

Categorical Encoding Map:

Data Analysis (EDA)¶

Biodiversity Overall¶

Species Types: Animal, Plant

Species Categories: Mammal, Bird, Reptile, Amphibian, Fish, Vascular Plant, Nonvascular Plant

Table 7: Number of Species per Category

Figure 1: Biodiversity per Category

Biodiversity per National Park¶

National Parks: Great Smoky Mountains, Yosemite, Bryce, Yellowstone

Table 8: Observed Spiecies per National Park

Figure 1: Observed Individuals per National Park

Figure 2: Reptile Species Distribution per Park:

Figure 3: Amphibian Species Distribution per Park:

Figure 4: Mammal Species Distribution per Park:

Figure 5: Bird Species Distribution per Park:

Species Conservation states¶

Species Conservation Categories: Unknown, Species of Concern, Endangered, Threatened, In Recovery

Table 9: Species Conservation State

Table 10: Spiecies Conservation states overview

Figure 6: Species Observations per Category over Conservation Status

Correlation investigation¶

Table 11: Correlations

Table 12: Medians (unencoded)

Figure 7: Spiecies Category over Threat Level

Additional investigation¶

Figure 8: Species Count Frequencies

References¶

Data Sources¶