101 changes: 97 additions & 4 deletions scripts/1-fetch/smithsonian_fetch.py
@@ -39,13 +39,77 @@
"TOTAL_OBJECTS",
]
HEADER_2_UNITS = [
"UNIT",
"UNIT_CODE",
"UNIT_NAME",
"CC0_RECORDS",
"CC0_RECORDS_WITH_CC0_MEDIA",
"TOTAL_OBJECTS",
]
QUARTER = os.path.basename(PATHS["data_quarter"])

# Unit codes and names manually compiled from
# 'https://github.com/Smithsonian/OpenAccess'
UNIT_MAP = {
"AAA": "Archives of American Art",
"AAG": "Archives of American Gardens",
"ACM": "Anacostia Community Museum",
"ACMA": "Anacostia Community Museum Archives",
"CFCHFOLKLIFE": "Ralph Rinzler Folklife Archives and Collections",
"CHNDM": "Cooper Hewitt, Smithsonian Design Museum",
"FBR": "Smithsonian Field Book Project",
"FSG": "Freer Gallery of Art and Arthur M. Sackler Gallery",
"HAC": "Smithsonian Gardens",
"HMSG": "Hirshhorn Museum and Sculpture Garden",
"HSFA": "Human Studies Film Archives",
"NASM": "National Air and Space Museum",
"NMAAHC": "National Museum of African American History and Culture",
"NMAH": "National Museum of American History",
"NMAI": "National Museum of the American Indian",
"NMAfA": "National Museum of African Art",
"NMNHANTHRO": ("National Musuem of Natural History - Anthropology Dept."),
"NMNHBIRDS": (
"National Musuem of Natural History"
" - Vertebrate Zoology - Birds Division"
),
"NMNHBOTANY": ("National Musuem of Natural History - Botany Dept."),
"NMNHEDUCATION": (
"National Musuem of Natural History" " - Education & Outreach"
),
"NMNHENTO": ("National Musuem of Natural History - Entomology Dept."),
"NMNHFISHES": (
"National Musuem of Natural History"
" - Vertebrate Zoology - Fishes Division"
),
"NMNHHERPS": (
"National Musuem of Natural History"
" - Vertebrate Zoology - Herpetology Division"
),
"NMNHINV": (
"National Musuem of Natural History" " - Invertebrate Zoology Dept."
),
"NMNHMAMMALS": (
"National Musuem of Natural History"
" - Vertebrate Zoology - Mammals Division"
),
"NMNHMINSCI": (
"National Musuem of Natural History" " - Mineral Sciences Dept."
),
"NMNHPALEO": ("National Musuem of Natural History - Paleobiology Dept."),
"NPG": "National Portrait Gallery",
"NPM": "National Postal Museum",
"NZP": "Smithsonian's National Zoo & Conservation Biology Institute",
"OCIO_DPO3D": "OCIO Digital Preservation & 3D Team",
"OFEO-SG": "Office of Facilities Engineering &"
" Operations – Smithsonian Gardens",
"SAAM": "Smithsonian American Art Museum",
"SIA": "Smithsonian Institution Archives",
"SIL": "Smithsonian Libraries",
"SILAF": "Smithsonian Institution Libraries, African Section",
"SILNMAHTL": "Smithsonian Institution Libraries,"
" National Museum of American History, Library",
"SLA_SRO": "Smithsonian Libraries Archives, Special Research/Operations",
}


def parse_arguments():
"""
@@ -121,14 +185,41 @@ def write_data(args, data_metrics, data_units):
return args


def fetch_unit_codes(session):
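    """
    Compare UNIT_MAP against the unit_code terms currently returned by
    the Smithsonian Open Access API and log any drift.
    """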
LOGGER.info("Fetching current unit codes from Smithsonian API")
url = "https://api.si.edu/openaccess/api/v1.0/terms/unit_code"
params = {"api_key": DATA_GOV_API_KEY}
try:
with session.get(url, params=params) as response:
response.raise_for_status()
api_codes = set(response.json()["response"]["terms"])
except requests.HTTPError as e:
raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
except requests.RequestException as e:
raise shared.QuantifyingException(f"Request Exception: {e}", 1)
except KeyError as e:
raise shared.QuantifyingException(f"KeyError: {e}", 1)

map_codes = set(UNIT_MAP.keys())
new_codes = sorted(api_codes - map_codes)
removed_codes = sorted(map_codes - api_codes)

if new_codes:
LOGGER.warning(f"New unit code(s) not in unit_map: {new_codes}")
if removed_codes:
LOGGER.warning(f"unit_map code(s) no longer in API: {removed_codes}")
if not new_codes and not removed_codes:
LOGGER.info("unit_map is up to date")


def query_smithsonian(args, session):
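    """
    Query the Smithsonian Open Access stats endpoint for overall CC0
    metrics and per-unit counts.
    """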
if not DATA_GOV_API_KEY:
raise shared.QuantifyingException(
"Authentication (DATA_GOV_API_KEY) required. Please ensure your"
" API key is set in .env",
1,
)
LOGGER.info("Fetch CC0 metrics and units from units from Smithsonain")
LOGGER.info("Fetch CC0 metrics and units from units from Smithsonian")
url = "https://api.si.edu/openaccess/api/v1.0/stats"
params = {"api_key": DATA_GOV_API_KEY}
try:
Expand Down Expand Up @@ -158,15 +249,16 @@ def query_smithsonian(args, session):
continue
data_units.append(
{
"UNIT": unit["unit"],
"UNIT_CODE": unit["unit"],
"UNIT_NAME": UNIT_MAP.get(unit["unit"], unit["unit"]),
"CC0_RECORDS": unit["metrics"]["CC0_records"],
"CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
"CC0_records_with_CC0_media"
],
"TOTAL_OBJECTS": unit["total_objects"],
}
)
data_units = sorted(data_units, key=itemgetter("UNIT"))
data_units = sorted(data_units, key=itemgetter("UNIT_CODE"))
LOGGER.info(f"Fetched stats for {len(data_units)} units")
return data_metrics, data_units

@@ -176,6 +268,7 @@ def main():
shared.paths_log(LOGGER, PATHS)
check_for_completion()
session = shared.get_session()
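    # Warn if UNIT_MAP has drifted from the API's current unit codes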
fetch_unit_codes(session)
data_metrics, data_units = query_smithsonian(args, session)
args = write_data(args, data_metrics, data_units)
args = shared.git_add_and_commit(
203 changes: 203 additions & 0 deletions scripts/2-process/smithsonian_process.py
@@ -0,0 +1,203 @@
#!/usr/bin/env python
"""
This file is dedicated to processing Smithsonian data
for analysis and comparison between quarters.
"""

# Standard library
import argparse
import os
import sys
import traceback

# Third-party
import pandas as pd

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared # noqa: E402

# Setup
LOGGER, PATHS = shared.setup(__file__)

# Constants
QUARTER = os.path.basename(PATHS["data_quarter"])
FILE_PATHS = [
shared.path_join(PATHS["data_phase"], "smithsonian_totals_by_units.csv"),
shared.path_join(PATHS["data_phase"], "smithsonian_totals_by_records.csv"),
]


def parse_arguments():
"""
Parse command-line options, returns parsed argument namespace.
"""
global QUARTER
LOGGER.info("Parsing command-line options")
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--quarter",
default=QUARTER,
help=f"Data quarter in format YYYYQx (default: {QUARTER})",
)
parser.add_argument(
"--enable-save",
action="store_true",
help="Enable saving results (default: False)",
)
parser.add_argument(
"--enable-git",
action="store_true",
help="Enable git actions such as fetch, merge, add, commit, and push"
" (default: False)",
)
parser.add_argument(
"--force",
action="store_true",
help="Regenerate data even if processed files already exist",
)

args = parser.parse_args()
if not args.enable_save and args.enable_git:
parser.error("--enable-git requires --enable-save")
if args.quarter != QUARTER:
global FILE_PATHS, PATHS
FILE_PATHS = shared.paths_list_update(
LOGGER, FILE_PATHS, QUARTER, args.quarter
)
PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
QUARTER = args.quarter
args.logger = LOGGER
args.paths = PATHS
return args


def process_totals_by_units(args, count_data):
"""
Processing count data: totals by units
"""
LOGGER.info(process_totals_by_units.__doc__.strip())
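    # Map each unit name to its total object count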
data = {}

for row in count_data.itertuples(index=False):
unit = str(row.UNIT_NAME)
total_objects = int(row.TOTAL_OBJECTS)

data[unit] = total_objects

data = pd.DataFrame(data.items(), columns=["Unit_name", "Total_objects"])
data.sort_values("Unit_name", ascending=True, inplace=True)
data.reset_index(drop=True, inplace=True)
file_path = shared.path_join(
PATHS["data_phase"], "smithsonian_totals_by_units.csv"
)
shared.data_to_csv(args, data, file_path)


def process_totals_by_records(args, count_data):
"""
Processing count data: totals by records
"""
LOGGER.info(process_totals_by_records.__doc__.strip())
data = {}

for row in count_data.itertuples(index=False):
unit = str(row.UNIT_NAME)
CC0_records = int(row.CC0_RECORDS)
CC0_records_with_CC0_media = int(row.CC0_RECORDS_WITH_CC0_MEDIA)
total_objects = int(row.TOTAL_OBJECTS)

if unit not in data:
data[unit] = {
"CC0_records": 0,
"CC0_records_with_CC0_media": 0,
"Total_objects": 0,
}

data[unit]["CC0_records"] += CC0_records
data[unit]["CC0_records_with_CC0_media"] += CC0_records_with_CC0_media
data[unit]["Total_objects"] += total_objects

data = (
pd.DataFrame.from_dict(data, orient="index")
.reset_index()
.rename(columns={"index": "Unit_name"})
)
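    # The three percentage columns partition Total_objects: CC0 records
    # without CC0 media, CC0 records with CC0 media, and all remaining
    # (non-CC0) objects, each expressed as a share of total objects.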
data["CC0_without_media_percentage"] = (
(
(data["CC0_records"] - data["CC0_records_with_CC0_media"])
/ data["Total_objects"]
)
* 100
).round(2)

data["CC0_with_media_percentage"] = (
(data["CC0_records_with_CC0_media"] / data["Total_objects"]) * 100
).round(2)

data["Others_percentage"] = (
((data["Total_objects"] - data["CC0_records"]) / data["Total_objects"])
* 100
).round(2)

data.sort_values("Unit_name", ascending=True, inplace=True)
data.reset_index(drop=True, inplace=True)

file_path = shared.path_join(
PATHS["data_phase"], "smithsonian_totals_by_records.csv"
)
shared.data_to_csv(args, data, file_path)


def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
shared.git_fetch_and_merge(args, PATHS["repo"])
shared.check_completion_file_exists(args, FILE_PATHS)
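    # Load the per-unit counts produced by the fetch phase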
file_count = shared.path_join(
PATHS["data_1-fetch"], "smithsonian_2_units.csv"
)
count_data = shared.open_data_file(
LOGGER,
file_count,
usecols=[
"UNIT_CODE",
"UNIT_NAME",
"CC0_RECORDS",
"CC0_RECORDS_WITH_CC0_MEDIA",
"TOTAL_OBJECTS",
],
)
process_totals_by_units(args, count_data)
process_totals_by_records(args, count_data)

# Push changes
args = shared.git_add_and_commit(
args,
PATHS["repo"],
PATHS["data_quarter"],
f"Add and commit new GitHub data for {QUARTER}",
)
shared.git_push_changes(args, PATHS["repo"])


if __name__ == "__main__":
try:
main()
except shared.QuantifyingException as e:
if e.exit_code == 0:
LOGGER.info(e.message)
else:
LOGGER.error(e.message)
sys.exit(e.exit_code)
except SystemExit as e:
LOGGER.error(f"System exit with code: {e.code}")
sys.exit(e.code)
except KeyboardInterrupt:
LOGGER.info("(130) Halted via KeyboardInterrupt.")
sys.exit(130)
except Exception:
LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
sys.exit(1)