In general, this notebook follows the structure of our survey instrument. The full text of the survey, as it was presented to study participants, is included as Borghi_VanGulick_PsychRDM_Survey.pdf.
The CSV file containing the data used in this notebook is named Borghi_VanGulick_PsychRDM_Data.csv and is accompanied by a data dictionary (Borghi_VanGulick_PsychRDM_Dictionary.csv).
#Import the packages needed to make this notebook work
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
#Make sure the figures appear in the notebook
%matplotlib inline
#Import the data: read the survey responses into df, the dataframe every later cell reads from.
#NOTE(review): the introduction names the data file Borghi_VanGulick_PsychRDM_Data.csv,
#but this reads "downloads/Data_cleaned.csv" - confirm the path before running.
df = pd.read_csv("downloads/Data_cleaned.csv")
Description: Before we ask about your specific methods, tools, and data management practices, we have some general questions about you, your lab or research group, and your area of research. The information you provide in this section will help us contextualize your other survey responses.
#"title" column (one answer per participant).
#Tabulate the count of each answer and its share of the non-missing answers (percent).
df_title = (
    df["title"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["title"].value_counts(normalize=True) * 100)
)
df_title
#Free-text entries from participants who selected "other".
df["title_other_text"].value_counts()
#"years" column (one answer per participant).
#Overall summary: median first, then the describe() table.
print("median ", df["years"].median())
print(df["years"].describe())
#The same summary split by the "title" column, with the median added as a column.
years_by_title = df.groupby("title")["years"]
df_title_years = years_by_title.describe().assign(median=years_by_title.median())
df_title_years
#"institution_type" column (one answer per participant).
#Tabulate the count of each answer and its share of the non-missing answers (percent).
df_institution_type = (
    df["institution_type"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["institution_type"].value_counts(normalize=True) * 100)
)
df_institution_type
#Free-text entries from participants who selected "other".
df["institution_type_other_text"].value_counts()
#Participants could only give one response to this question.
#Create dataframe containing both number of responses and normalized responses (percentage).
df_country = pd.DataFrame({"Number": df["country"].value_counts(),
                           "Percentage": df["country"].value_counts(normalize=True)*100})
#Display the combined table (counts and percentages) rather than the bare counts,
#so the dataframe built above is actually shown.
df_country
#Lab size question: several free numeric responses per participant.
#Map each response column to the row label shown in the table.
lab_size_variables = {
    "lab_size_total": "Total lab size",
    "lab_size_ra": "Research assistants",
    "lab_size_gs": "Graduate students",
    "lab_size_pd": "Postdocs",
    "lab_size_fts": "Full time staff",
    "lab_size_pts": "Part time staff",
}
#One row per column: describe() statistics plus the median, relabelled for display.
df_lab_size = (
    df[list(lab_size_variables)]
    .describe()
    .T
    .assign(median=df[list(lab_size_variables)].median())
    .set_index([list(lab_size_variables.values())])
)
df_lab_size
#"collaboration" column (a single free response per participant).
#Print the median, then the describe() summary.
print("median ", df["collaboration"].median())
print(df["collaboration"].describe())
#"research_area" column (one answer per participant).
#Tabulate the count of each answer and its share of the non-missing answers (percent).
df_area = (
    df["research_area"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["research_area"].value_counts(normalize=True) * 100)
)
df_area
#Free-text entries from participants who selected "other".
df["research_area_text"].value_counts()
#"data_type" column (one answer per participant).
#Tabulate the count of each answer and its share of the non-missing answers (percent).
df_data_type = (
    df["data_type"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["data_type"].value_counts(normalize=True) * 100)
)
df_data_type
#Funding sources (multiple selections allowed).
#Map each response column to the row label shown in the table.
funder_variables = {
    "funder_nih": "National Institutes of Health",
    "funder_nsf": "National Science Foundation",
    "funder_government": "Other government funding",
    "funder_private": "Private foundation",
    "funder_professional": "Professional organization/society",
    "funder_commercial": "Commercial organization",
    "funder_internal": "Internal grants (including startup)",
    "funder_other": "Other",
    "funder_none": "I do not have funding for my work",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "funder" column, times 100.
df_funder = (
    df[list(funder_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["funder"].sum() * 100)
    .set_index([list(funder_variables.values())])
)
df_funder
#Free-text entries from participants who selected "other".
df["funder_other_text"].value_counts()
#Role on funded work (multiple selections allowed).
#Map each response column to the row label shown in the table.
funder_role_variables = {
    "funder_role_pi": "Primary Investigator",
    "funder_role_ci": "Co-Investigator",
    "funder_role_associate": "Faculty associate",
    "funder_role_consultant": "Consultant",
    "funder_role_pd": "Postdoc",
    "funder_role_gs": "Graduate Student",
    "funder_role_ug": "Undergraduate Student",
    "funder_role_other": "Other",
    "funder_role_na": "Not Applicable",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "funder" column, times 100.
#NOTE(review): the denominator is the "funder" column, the same one used for the
#funding-source table, not a role-specific column - confirm this is intended.
df_funder_role = (
    df[list(funder_role_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["funder"].sum() * 100)
    .set_index([list(funder_role_variables.values())])
)
df_funder_role
#Free-text entries from participants who selected "other".
df["funder_role_other_text"].value_counts()
#"dmp" column (one answer per participant).
#Tabulate the count of each answer and its share of the non-missing answers (percent).
df_dmp = (
    df["dmp"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["dmp"].value_counts(normalize=True) * 100)
)
df_dmp
#"education_methods_*" columns (multiple selections allowed).
#Map each response column to the row label shown in the table.
education_methods_variables = {
    "education_methods_courses_school": "Workshops or courses during undergraduate/graduate education",
    "education_methods_courses_other": "Workshops or courses not associated with undergraduate/graduate education",
    "education_methods_best_practices_psych": "Guidance or best practices created by organizations/experts in psychology",
    "education_methods_best_practices_other": "Guidance or best practices created by organizations/experts outside psychology",
    "education_methods_person_collab": "From researchers who are in/collaborate with my research group",
    "education_methods_person_other": "From researchers who are not in/do not collaborate with my research group",
    "education_methods_social_media": "Through social media",
    "education_methods_self": "Self education",
    "education_methods_none": "I have recieved no training",
    "education_methods_other": "Other",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "education_method" column, times 100.
df_education_methods = (
    df[list(education_methods_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["education_method"].sum() * 100)
    .set_index([list(education_methods_variables.values())])
)
df_education_methods
#Free-text entries from participants who selected "other".
df["education_methods_other_text"].value_counts()
#"education_management_*" columns (multiple selections allowed).
#Map each response column to the row label shown in the table.
education_management_variables = {
    "education_management_courses_school": "Workshops or courses during undergraduate/graduate education",
    "education_management_courses_other": "Workshops or courses not associated with undergraduate/graduate education",
    "education_management_best_practices_psych": "Guidance or best practices created by organizations/experts in psychology",
    "education_management_best_practices_other": "Guidance or best practices created by organizations/experts outside psychology",
    "education_management_person_collab": "From researchers who are in/collaborate with my research group",
    "education_management_person_other": "From researchers who are not in/do not collaborate with my research group",
    "education_management_social_media": "Through social media",
    "education_management_self": "Self education",
    "education_management_none": "I have recieved no training",
    "education_management_other": "Other",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "education_method" column, times 100.
#NOTE(review): the denominator is "education_method", the column also used for the
#education_methods table - confirm it is the intended one for this question.
df_education_management = (
    df[list(education_management_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["education_method"].sum() * 100)
    .set_index([list(education_management_variables.values())])
)
df_education_management
#Free-text entries from participants who selected "other".
df["education_management_other_text"].value_counts()
#Institutional resources: a multi-part question with one answer per part.
#Each table column holds the percentage of non-missing answers for one part.
#NOTE(review): this rebinds df_education_methods, replacing the table built for the
#education_methods question - confirm nothing later needs the earlier table.
institution_resource_columns = {
    "Data Management": "institution_resource_rdm",
    "Data Sharing": "institution_resource_sharing",
    "Insfrastructure (IT)": "institution_resource_it",
}
df_education_methods = pd.DataFrame({
    heading: df[column].value_counts(normalize=True) * 100
    for heading, column in institution_resource_columns.items()
})
df_education_methods
#"limits_*" columns: a multi-part question with one answer per part.
#Map each response column to the row label shown in the table.
limits_variables = {
    "limits_time": "The amount of time it takes",
    "limits_cost": "The financial cost",
    "limits_norms": "Lack of norms or best practices",
    "limits_training": "Lack of training",
    "limits_incentives": "Lack of professional incentives",
    "limits_support": "Lack of institutional support",
    "limits_guidance": "Lack of guidance from my PI/collaborators",
    "limits_pi": "My supervisor requires I manage my data in a certain way",
    "limits_data": "The charactoristics of my data limit what I can do",
    "limits_knowldge": "I am unaware of best practices",
}
#One row per part: describe() statistics plus the median, relabelled for display.
#Interesting and informative, but remember the responses are ordinal.
df_limits_cont = (
    df[list(limits_variables)]
    .describe()
    .T
    .assign(median=df[list(limits_variables)].median())
    .set_index([list(limits_variables.values())])
)
df_limits_cont
#Create dataframe displaying the percentage of participants who entered each value.
#Rows are the question parts (relabelled for display); columns are the response values.
#Series.value_counts is used because the top-level pd.value_counts is deprecated.
df_limits_cat = (
    df[list(limits_variables.keys())]
    .apply(lambda responses: responses.value_counts(normalize=True))
    .mul(100)
    .T
)
df_limits_cat.set_index([list(limits_variables.values())], inplace=True)
df_limits_cat
#This question contained multiple parts, participants gave one answer to each.
#Create dictionary mapping each response column to the row label shown in the table.
motivations_variables = {
    "motivations_loss": "Prevent loss of data",
    "motivations_continuity": "Ensure continuity as research team changes",
    "motivations_compliance_funding": "Compliance with mandates from funder/publisher",
    "motivations_compliance_ethics": "Compliance with legal/ethical frameworks",
    "motivations_guidance": "Availability of guidance of best practices",
    "motivations_training": "Availability of training",
    "motivations_best_practice": "Awareness of best practices",
    "motivations_support": "Institutional support",
    "motivations_pi": "Guidance from PI/Collaborators",
    "motivations_transparency": "Desire to foster research transparency",
    "motivations_reproducibility": "Desire to foster reproducibility",
}
#Create dataframe containing descriptive statistics.
#They're interesting and informative, but remember responses are ordinal.
df_motivations_cont = df[list(motivations_variables.keys())].describe().T
df_motivations_cont["median"] = df[list(motivations_variables.keys())].median()
#DataFrame.mode() returns one row per tied mode, so assigning the whole frame to a
#single column fails as soon as any item has a tie. Take the first row (the
#smallest mode of each item) so exactly one value per item is stored.
df_motivations_cont["mode"] = df[list(motivations_variables.keys())].mode().iloc[0]
df_motivations_cont.set_index([list(motivations_variables.values())], inplace=True)
df_motivations_cont
#Create dataframe displaying the percentage of participants who entered each value.
#Rows are the question parts (relabelled for display); columns are the response values.
#Series.value_counts is used because the top-level pd.value_counts is deprecated.
df_motivations_cat = (
    df[list(motivations_variables.keys())]
    .apply(lambda responses: responses.value_counts(normalize=True))
    .mul(100)
    .T
)
df_motivations_cat.set_index([list(motivations_variables.values())], inplace=True)
df_motivations_cat
Description: The questions in this section concern activities and practices beginning with the collection of raw data from human participants and ending before data are processed and/or analyzed.
#For each of the three data-collection rating columns, print the median, the
#describe() summary, and the count of each response value.
#Interesting and informative, but remember the responses are ordinal.
for collect_rating_column in ("collect_mature_self", "collect_mature_field", "collect_change"):
    print("median ", df[collect_rating_column].median())
    print(df[collect_rating_column].describe())
    print(df[collect_rating_column].value_counts())
#How data are collected (multiple selections allowed).
#Map each response column to the row label shown in the table.
collect_how_variables = {
    "collect_how_lab": "Participants come to my lab to participate in an experiment",
    "collect_how_travel": "I travel to my participants to collect data",
    "collect_how_send": "I send my participants materials, which they return to me",
    "collect_how_internet": "I collect data via the internet",
    "collect_how_prompt": "I prompt participants to enter data",
    "collect_how_secondary": "I examine records or data collected by others.",
    "collect_how_other": "Other",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "collect_how" column, times 100.
df_collect_how = (
    df[list(collect_how_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["collect_how"].sum() * 100)
    .set_index([list(collect_how_variables.values())])
)
df_collect_how
#Free-text entries from participants who selected "other".
df["collect_how_other_text"].value_counts()
#"collect_vulnerable" column (one answer per participant).
#Tabulate the count of each answer and its share of the non-missing answers (percent).
#NOTE(review): the result is stored in df_data_type, which replaces the table built
#earlier from the "data_type" column - confirm the name reuse is intended.
df_data_type = (
    df["collect_vulnerable"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["collect_vulnerable"].value_counts(normalize=True) * 100)
)
df_data_type
#Data collection software (multiple selections allowed).
#Map each response column to the row label shown in the table.
collect_software_variables = {
    "collect_software_eprime": "E-Prime",
    "collect_software_builder": "Experiment Builder",
    "collect_software_inquisit": "Inquisit",
    "collect_software_presentation": "presentation",
    "collect_software_psychopy": "psychopy",
    "collect_software_matlab": "Matlab",
    "collect_software_redcap": "REDcap",
    "collect_software_qualtrics": "Qualtrics",
    "collect_software_custom": "Custom code",
    "collect_software_other": "Other",
    "collect_software_none": "I don't use software for this purpose",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "collect_software" column, times 100.
df_collect_software = (
    df[list(collect_software_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["collect_software"].sum() * 100)
    .set_index([list(collect_software_variables.values())])
)
df_collect_software
#Free-text entries from participants who selected "other".
df["collect_software_other_text"].value_counts()
#Types of data collected (multiple selections allowed).
#Map each response column to the row label shown in the table.
collect_data_variables = {
    "collect_data_av": "Audio/visual recordings",
    "collect_data_demographics": "Demographic data",
    "collect_data_clinical": "Clinical or Medical data",
    "collect_data_scales_quantitative": "Quantitative data from questionnaires",
    "collect_data_scales_qualitative": "Qualitative data from questionnaires",
    "collect_data_behavioral": "Behavioral data",
    "collect_data_qualitative": "Qualitative data",
    "collect_data_neurpsych": "Neuropsychological or aptitude tests",
    "collect_data_neuroimaging": "Neuroimaging data",
    "collect_data_writing": "Data from written documents",
    "collect_data_physiology": "Physiological data",
    "collect_data_genetic": "Genetic/molecular data",
    "collect_data_eye_tracking": "Eye tracking/pupillometry data",
    "collect_data_other": "Other",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "collect_data" column, times 100.
df_collect_data = (
    df[list(collect_data_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["collect_data"].sum() * 100)
    .set_index([list(collect_data_variables.values())])
)
df_collect_data
#Free-text entries from participants who selected "other".
df["collect_data_other_text"].value_counts()
#Information recorded alongside the data (multiple selections allowed).
#Map each response column to the row label shown in the table.
collect_information_variables = {
    "collect_information_session": "Information about the data collection session",
    "collect_information_paradigm": "Research protocol/paradigm-related information",
    "collect_information_stimuli": "Research-related stimuli",
    "collect_information_text": "Text of questionnaires, scales,etc",
    "collect_information_scripts": "Computer code used for data collection",
    "collect_information_coding": "Coding materials",
    "collect_information_consent": "Informed consent-related documentation",
    "collect_information_other": "Other",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "collect_information" column, times 100.
collect_information_data = (
    df[list(collect_information_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["collect_information"].sum() * 100)
    .set_index([list(collect_information_variables.values())])
)
collect_information_data
#Free-text entries from participants who selected "other".
df["collect_information_other_text"].value_counts()
#"collect_code_*" columns (multiple selections allowed).
#Map each response column to the row label shown in the table.
collect_code_variables = {
    "collect_code_ir": "Institutional repository",
    "collect_code_github": "Software-specific hosting service",
    "collect_code_osf": "Open Science Framework (OSF)",
    "collect_code_repo": "General purpose repository ",
    "collect_code_article": "Journal article ",
    "collect_code_website": "Website",
    "collect_code_other": "Other",
    "collect_code_none": "None",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "collect_information" column, times 100.
#NOTE(review): the denominator is "collect_information", the column used for the
#collect_information table - confirm it is the intended one for this question.
df_collect_code = (
    df[list(collect_code_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["collect_information"].sum() * 100)
    .set_index([list(collect_code_variables.values())])
)
df_collect_code
#Free-text entries from participants who selected "other".
df["collect_code_other_text"].value_counts()
#Where data are stored and analyzed (multiple selections allowed).
#Map each response column to the row label shown in the table.
collect_storage_variables = {
    "collect_storage_own_both": "I use my own machine(s) to store and analyze my data",
    "collect_storage_own_analysis": "I use my own machine(s) to analyze my data, but I store my data on a shared drive.",
    "collect_storage_workstation": "I use a workstation that I share with other researchers to analyze and store my data.",
    "collect_storage_server": "I log in to my lab’s shared server or cluster to analyze and store my data.",
    "collect_storage_none": "I do not analyze or store my data electronically.",
    "collect_storage_other": "Other",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "collect_storage" column, times 100.
df_collect_storage = (
    df[list(collect_storage_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["collect_storage"].sum() * 100)
    .set_index([list(collect_storage_variables.values())])
)
df_collect_storage
#Free-text entries from participants who selected "other".
df["collect_storage_other_text"].value_counts()
#Procedures for organizing data (multiple selections allowed).
#Map each response column to the row label shown in the table.
collect_organize_variables = {
    "collect_organize_names": "Standardized file naming",
    "collect_organize_structures": "Standardized file organization",
    "collect_organize_notebook": "Lab notebook, data dictionary, codebook",
    "collect_organize_general": "General procedures that aren't standardized or recorded",
    "collect_organize_none": "No procedures",
    "collect_organize_na": "Not applicable",
    "collect_organize_other": "Other",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "collect_organize" column, times 100.
df_collect_organize = (
    df[list(collect_organize_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["collect_organize"].sum() * 100)
    .set_index([list(collect_organize_variables.values())])
)
df_collect_organize
#Free-text entries from participants who selected "other".
df["collect_organize_other_text"].value_counts()
#"collect_organize_lab" column (one answer per participant).
#Tabulate the count of each answer and its share of the non-missing answers (percent).
df_collect_organize_lab = (
    df["collect_organize_lab"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["collect_organize_lab"].value_counts(normalize=True) * 100)
)
df_collect_organize_lab
#Backup methods (multiple selections allowed).
#Map each response column to the row label shown in the table.
collect_backup_variables = {
    "collect_backup_digitize": "Digitizing non-digital files/data.",
    "collect_backup_cabinet": "Storing non-digital data in a secure location",
    "collect_backup_hard_drive": "External hard drive",
    "collect_backup_backup_manual": "Manually backing up my local machine",
    "collect_backup_backup_automatic": "Automatically backing up my local machine",
    "collect_backup_server_lab": "Using a lab-owned server",
    "collect_backup_server_department": "Local server (Department)",
    "collect_backup_server_institution": "Local server (Institution)",
    "collect_backup_cloud": "Upload to the cloud",
    "collect_backup_ir": "Deposit it to my institutional repository",
    "collect_backup_other": "Other",
    "collect_backup_none": "I do not back up my files",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "collect_backup" column, times 100.
df_backup_organize = (
    df[list(collect_backup_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["collect_backup"].sum() * 100)
    .set_index([list(collect_backup_variables.values())])
)
df_backup_organize
#"collect_backup_lab" column (one answer per participant).
#Tabulate the count of each answer and its share of the non-missing answers (percent).
df_collect_backup_lab = (
    df["collect_backup_lab"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["collect_backup_lab"].value_counts(normalize=True) * 100)
)
df_collect_backup_lab
#"collect_backup_copy" column (one answer per participant), tabulated the same way.
df_collect_backup_copy = (
    df["collect_backup_copy"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["collect_backup_copy"].value_counts(normalize=True) * 100)
)
df_collect_backup_copy
#"collect_education_*" columns: a multi-part question with one answer per part.
#Map each response column to the row label shown in the table.
collect_education_variables = {
    "collect_education_dmp": "Completing a data management plan (DMP)",
    "collect_education_storing": "Best practices for storing and backing up data.",
    "collect_education_security": "Ensuring the security of sensitive data ",
    "collect_education_organizing": "Organizing data",
    "collect_education_documenting": "Documenting data",
}
#One row per part: describe() statistics plus the median, relabelled for display.
#Interesting and informative, but remember the responses are ordinal.
df_collect_education_cont = (
    df[list(collect_education_variables)]
    .describe()
    .T
    .assign(median=df[list(collect_education_variables)].median())
    .set_index([list(collect_education_variables.values())])
)
df_collect_education_cont
#Create dataframe displaying the percentage of participants who entered each value.
#Rows are the question parts (relabelled for display); columns are the response values.
#Series.value_counts is used because the top-level pd.value_counts is deprecated.
#NOTE(review): this reuses the name df_collect_education_cont, replacing the
#descriptive-statistics table; the stacked bar chart code reads this name.
df_collect_education_cont = (
    df[list(collect_education_variables.keys())]
    .apply(lambda responses: responses.value_counts(normalize=True))
    .mul(100)
    .T
)
df_collect_education_cont.set_index([list(collect_education_variables.values())], inplace=True)
df_collect_education_cont
#Draw df_collect_education_cont as a stacked horizontal bar chart in the "Greens" palette.
with sns.color_palette("Greens"):
    bar_education_collect_stacked = df_collect_education_cont.plot(
        kind="barh", stacked=True, legend=False
    )
#Clean up the formatting: first row on top, x axis fixed to 0-100, legend outside the axes.
bar_education_collect_stacked.invert_yaxis()
bar_education_collect_stacked.set(xlim=(0, 100))
plt.xlabel("Percentage")
sns.despine(offset=10)
plt.legend(bbox_to_anchor=(1, 1), loc=2)
#Dashed vertical guide lines at 25, 50 and 75 percent.
for guide_position in (25, 50, 75):
    plt.axvline(guide_position, color="k", linestyle="--")
Description: The questions in this section concern activities and practices starting when data is processed, cleaned, or inspected, continuing through the application of descriptive and/or inferential statistics, and ending before the data is made available or described in a presentation or scholarly publication.
#For each of the three data-analysis rating columns, print the median, the
#describe() summary, and the count of each response value.
#Interesting and informative, but remember the responses are ordinal.
for analyze_rating_column in ("analyze_mature_self", "analyze_mature_field", "analyze_change"):
    print("median ", df[analyze_rating_column].median())
    print(df[analyze_rating_column].describe())
    print(df[analyze_rating_column].value_counts())
#"analyze_tools_taught_*" columns (multiple selections allowed).
#Map each response column to the row label shown in the table.
analyze_tools_taught_variables = {
    "analyze_tools_taught_excel": "Excel",
    "analyze_tools_taught_jasp": "JASP",
    "analyze_tools_taught_jamovi": "jamovi",
    "analyze_tools_taught_lisrel": "Lisrel",
    "analyze_tools_taught_literate": "Literate programming tools",
    "analyze_tools_taught_matlab": "MATLAB",
    "analyze_tools_taught_mplus": "MPlus",
    "analyze_tools_taught_python": "python",
    "analyze_tools_taught_sas": "SAS",
    "analyze_tools_taught_spss": "SPSS",
    "analyze_tools_taught_stata": "STATA",
    "analyze_tools_taught_systat": "SYSTAT",
    "analyze_tools_taught_r": "R",
    "analyze_tools_taught_other": "Other",
    "analyze_tools_taught_no_tools": "I was not taught to use software tools",
    "analyze_tools_taught_no_training": "I have recieved no formal training",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "analyze_tools_taught" column, times 100.
df_analyze_tools_taught = (
    df[list(analyze_tools_taught_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["analyze_tools_taught"].sum() * 100)
    .set_index([list(analyze_tools_taught_variables.values())])
)
df_analyze_tools_taught
#Free-text entries from participants who selected "other".
df["analyze_tools_taught_other_text"].value_counts()
#"analyze_tools_use_*" columns (multiple selections allowed).
#Map each response column to the row label shown in the table.
#NOTE(review): the label for "analyze_tools_use_no_tools" is the same text used for
#the "taught" question - confirm it matches the survey wording for this question.
analyze_tools_use_variables = {
    "analyze_tools_use_excel": "Excel",
    "analyze_tools_use_jasp": "JASP",
    "analyze_tools_use_jamovi": "jamovi",
    "analyze_tools_use_lisrel": "Lisrel",
    "analyze_tools_use_literate": "Literate programming tools",
    "analyze_tools_use_matlab": "MATLAB",
    "analyze_tools_use_mplus": "MPlus",
    "analyze_tools_use_python": "python",
    "analyze_tools_use_sas": "SAS",
    "analyze_tools_use_spss": "SPSS",
    "analyze_tools_use_stata": "STATA",
    "analyze_tools_use_systat": "SYSTAT",
    "analyze_tools_use_r": "R",
    "analyze_tools_use_other": "Other",
    "analyze_tools_use_no_tools": "I was not taught to use software tools",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "analyze_tools_use" column, times 100.
df_analyze_tools_use = (
    df[list(analyze_tools_use_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["analyze_tools_use"].sum() * 100)
    .set_index([list(analyze_tools_use_variables.values())])
)
df_analyze_tools_use
#Free-text entries from participants who selected "other".
df["analyze_tools_use_other_text"].value_counts()
#Three single-answer questions, each tabulated as the count of every answer and its
#share of the non-missing answers (percent).
#"analyze_tools_lab" column.
df_analyze_tools_lab = (
    df["analyze_tools_lab"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["analyze_tools_lab"].value_counts(normalize=True) * 100)
)
df_analyze_tools_lab
#"analyze_tools_vc" column.
df_analyze_tools_vc = (
    df["analyze_tools_vc"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["analyze_tools_vc"].value_counts(normalize=True) * 100)
)
df_analyze_tools_vc
#"analyze_code" column.
df_analyze_code = (
    df["analyze_code"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["analyze_code"].value_counts(normalize=True) * 100)
)
df_analyze_code
#"analyze_code_shared_*" columns (multiple selections allowed).
#Map each response column to the row label shown in the table.
analyze_code_shared_variables = {
    "analyze_code_shared_ir": "Yes. In an institutional repository",
    "analyze_code_shared_github": "Yes. Using a software specific repository",
    "analyze_code_shared_osf": "Yes. Using the Open Science Framework",
    "analyze_code_shared_repo": "Yes. Using a general purpose repository",
    "analyze_code_shared_article": "Yes. As part of a journal article",
    "analyze_code_shared_website": "Yes. On a lab or project website",
    "analyze_code_shared_other": "Yes. Other.",
    "analyze_code_shared_none": "No",
}
#"Number" is the count of non-missing entries in each column; "Percentage" is that
#count divided by the sum of the "analyze_code_shared" column, times 100.
df_analyze_code_shared = (
    df[list(analyze_code_shared_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda counts: counts["Number"] / df["analyze_code_shared"].sum() * 100)
    .set_index([list(analyze_code_shared_variables.values())])
)
df_analyze_code_shared
#Free-text entries from participants who selected "other".
df["analyze_code_shared_other_text"].value_counts()
#Multi-select question: every answer option lives in its own column.
#Map each option column to the response text used as the row label.
analyze_docs_variables = dict(
    analyze_docs_paper="Physical notebook or on paper",
    analyze_docs_word="word processing or note-taking program ",
    analyze_docs_project_management="collaborative project management system ",
    analyze_docs_eln="electronic lab notebook",
    analyze_docs_literate="literate programming tools",
    analyze_docs_vc="version control system",
    analyze_docs_wiki="lab wiki",
    analyze_docs_readme="ReadMe files",
    analyze_docs_none="I do not document my activities in any systematic way.",
    analyze_docs_other="Other",
)
#"Number" is the count of non-missing entries per option column; "Percentage"
#divides that count by the sum of the question-level column
#(presumably the number of participants who answered - confirm in the data dictionary).
df_analyze_docs = (
    df[list(analyze_docs_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda t: t["Number"] / df["analyze_docs"].sum() * 100)
)
#Swap the column names in the index for the readable response text.
df_analyze_docs = df_analyze_docs.set_index([list(analyze_docs_variables.values())])
df_analyze_docs
#Tally the free-text answers given alongside "other".
df["analyze_docs_other_text"].value_counts()
#Single-response question: one row per answer option, with the raw count
#("Number") and the share of non-missing answers ("Percentage").
df_analyze_docs_lab = (
    df["analyze_docs_lab"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["analyze_docs_lab"].value_counts(normalize=True) * 100)
)
df_analyze_docs_lab
#Single-response question: same count/percentage table for "analyze_docs_others".
df_analyze_docs_others = (
    df["analyze_docs_others"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["analyze_docs_others"].value_counts(normalize=True) * 100)
)
df_analyze_docs_others
#This question contained multiple parts, participants gave one answer to each.
#Map each item column to the text used as its row label.
analyze_education_variables = {
    "analyze_education_methods": "Research design/methods",
    "analyze_education_tools": "Use of software tools",
    "analyze_education_organizing": "Organizing data for analysis",
    "analyze_education_documenting": "Documenting analysis decisions",
    "analyze_education_sharing": "Sharing code in a useful manner",
}
#Create dataframe containing descriptive statistics (one row per item).
#They're interesting and informative, but remember responses are ordinal.
df_analyze_education_cont = df[list(analyze_education_variables)].describe().T
df_analyze_education_cont["median"] = df[list(analyze_education_variables)].median()
df_analyze_education_cont.set_index([list(analyze_education_variables.values())], inplace=True)
df_analyze_education_cont
#Create dataframe displaying the percentage of participants who entered each value.
#Series.value_counts replaces the deprecated top-level pd.value_counts.
df_analyze_education_cat = df[list(analyze_education_variables)]
df_analyze_education_cat = df_analyze_education_cat.apply(pd.Series.value_counts, normalize=True) * 100
#Transpose so each item is a row and each response value a column.
df_analyze_education_cat = df_analyze_education_cat.T
df_analyze_education_cat.set_index([list(analyze_education_variables.values())], inplace=True)
df_analyze_education_cat
#Create the figure: one stacked horizontal bar per item, drawn with the "Greens" palette.
with sns.color_palette("Greens"):
    bar_analyze_collect_stacked = df_analyze_education_cat.plot(kind='barh', stacked=True, legend=False)
#Clean up the formatting
bar_analyze_collect_stacked.invert_yaxis()  #first item at the top
bar_analyze_collect_stacked.set(xlim=(0, 100))
plt.xlabel("Percentage")
sns.despine(offset=10)
plt.legend(bbox_to_anchor=(1, 1), loc=2)
#Dashed reference lines at 25%, 50% and 75%.
plt.axvline(25, color="k", linestyle="--");
plt.axvline(50, color="k", linestyle="--");
plt.axvline(75, color="k", linestyle="--");
Description: The questions in this section concern activities and practices related to the communication or publication of your research results in a presentation or scholarly publication or the sharing of your data via a general or discipline-specific repository (e.g. Figshare, Dryad, Zenodo, ICPSR).
#Print the median, describe() summary and value counts for each of the three
#data-sharing rating columns, in turn.
#Interesting and informative, but remember responses are ordinal.
for rating_column in ("share_mature_self", "share_mature_field", "share_change"):
    print("median ", df[rating_column].median())
    print(df[rating_column].describe())
    print(df[rating_column].value_counts())
#Multi-select question: every answer option lives in its own column.
#Map each option column to the response text used as the row label.
share_cant_variables = dict(
    share_cant_publish="Yes. My data contains additional findings I wish to discover/publish",
    share_cant_sensitive="Yes, my data contains confidential or sensitive information.",
    share_cant_irb="Yes, I have not received institutional review board approval to share my data.",
    share_cant_format="Yes, my data is in a format that makes it difficult to share with others.",
    share_cant_ip="Yes, my data is proprietary or subject to intellectual property concerns",
    share_cant_pi="Yes, my supervisor/collaborators do not wish to share the data.",
    share_cant_time="Yes, it would take too much time or effort for me to share my data.",
    share_cant_knowledge="Yes, I do not know how to share my data.",
    share_cant_other="Yes. Other ",
    share_cant_no_authorship="No, but I request authorship if others use my data",
    share_cant_no_citation="No, but I request citation or acknowledgement if others use my data.",
)
#"Number" is the count of non-missing entries per option column; "Percentage"
#divides that count by the sum of the question-level column
#(presumably the number of participants who answered - confirm in the data dictionary).
df_share_cant = (
    df[list(share_cant_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda t: t["Number"] / df["share_cant"].sum() * 100)
)
#Swap the column names in the index for the readable response text.
df_share_cant = df_share_cant.set_index([list(share_cant_variables.values())])
df_share_cant
#Tally the free-text answers given alongside "other".
df["share_cant_other_text"].value_counts()
#Multi-select question: every answer option lives in its own column.
#Map each option column to the response text used as the row label.
share_archive_variables = dict(
    share_archive_article="Yes, I have published my data as part of a journal article",
    share_archive_repo_government="Yes, I have deposited my data in a government or funder sponsored repository",
    share_archive_OSF="Yes, I have shared my data using the Open Science Framework (OSF)",
    share_archive_repo_other="Yes, I have deposited my data into a general purpose repository besides the OSF",
    share_archive_repo_disclipline="Yes, I have deposited my data in a discipline-specific repository",
    share_archive_repo_ir="Yes, I have deposited my data in my institutional repository",
    share_archive_other="Yes. Other",
    share_archive_none="No",
)
#"Number" is the count of non-missing entries per option column; "Percentage"
#divides that count by the sum of the question-level column
#(presumably the number of participants who answered - confirm in the data dictionary).
df_share_archive = (
    df[list(share_archive_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda t: t["Number"] / df["share_archive"].sum() * 100)
)
#Swap the column names in the index for the readable response text.
df_share_archive = df_share_archive.set_index([list(share_archive_variables.values())])
df_share_archive
#Tally the free-text answers given alongside "other".
df["share_archive_other_text"].value_counts()
#Multi-select question: every answer option lives in its own column.
#Map each option column to the response text used as the row label.
share_motivation_variables = dict(
    share_motivation_results="To communicate my results and/or add to the scholarly literature",
    share_motivation_validity="To allow other researchers to assess the validity of my conclusions",
    share_motivation_incentives="Professional incentives",
    share_motivation_ip="To establish intellectual property or patent claims.",
    share_motivation_mandate="It is mandated by a funder, publisher, or my institution.",
    share_motivation_transparency="To foster transparency and reproducibility.",
    share_motivation_reuse="To foster re-use",
    share_motivation_other="Other",
    share_motivation_na="Not applicable, I do not share my data in this manner.",
)
#"Number" is the count of non-missing entries per option column; "Percentage"
#divides that count by the sum of the question-level column
#(presumably the number of participants who answered - confirm in the data dictionary).
df_share_motivation = (
    df[list(share_motivation_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda t: t["Number"] / df["share_motivation"].sum() * 100)
)
#Swap the column names in the index for the readable response text.
df_share_motivation = df_share_motivation.set_index(
    [list(share_motivation_variables.values())]
)
df_share_motivation
#Tally the free-text answers given alongside "other".
df["share_motivation_other_text"].value_counts()
#Single-response question: one row per answer option, with the raw count
#("Number") and the share of non-missing answers ("Percentage").
df_share_publisher = (
    df["share_publisher"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["share_publisher"].value_counts(normalize=True) * 100)
)
df_share_publisher
#Single-response question: same count/percentage table for "share_request".
df_share_request = (
    df["share_request"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["share_request"].value_counts(normalize=True) * 100)
)
df_share_request
#Median, describe() summary and value counts for "share_request_usable".
print("median ", df["share_request_usable"].median())
print(df["share_request_usable"].describe())
print(df["share_request_usable"].value_counts())
#Single-response question: same count/percentage table for "share_receive".
df_share_receive = (
    df["share_receive"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["share_receive"].value_counts(normalize=True) * 100)
)
df_share_receive
#Median, describe() summary and value counts for "share_receive_usable".
print("median ", df["share_receive_usable"].median())
print(df["share_receive_usable"].describe())
print(df["share_receive_usable"].value_counts())
#Multi-select question: every answer option lives in its own column.
#Map each option column to the response text used as the row label.
share_use_variables = dict(
    share_use_replicate="To verify or replicate their results.",
    share_use_meta="As part of completing a meta-analysis.",
    share_use_extend="To extend conclusions drawn from it or test alternative hypotheses.",
    share_use_test="To learn a new technique, method, or tool.",
    share_use_none="I did not end up using it.",
    share_use_other="Other",
    share_use_na="Not applicable.",
)
#"Number" is the count of non-missing entries per option column; "Percentage"
#divides that count by the sum of the question-level column
#(presumably the number of participants who answered - confirm in the data dictionary).
df_share_use = (
    df[list(share_use_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda t: t["Number"] / df["share_use"].sum() * 100)
)
#Swap the column names in the index for the readable response text.
df_share_use = df_share_use.set_index([list(share_use_variables.values())])
df_share_use
#Tally the free-text answers given alongside "other".
df["share_use_other_text"].value_counts()
#Single-response question: one row per answer option, with the raw count
#("Number") and the share of non-missing answers ("Percentage").
df_share_docs_others = (
    df["share_docs_others"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["share_docs_others"].value_counts(normalize=True) * 100)
)
df_share_docs_others
#Single-response question: same count/percentage table for "share_preserve_time".
df_share_preserve_time = (
    df["share_preserve_time"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["share_preserve_time"].value_counts(normalize=True) * 100)
)
df_share_preserve_time
#Tally the free-text answers stored in "share_preserve_time_text".
df["share_preserve_time_text"].value_counts()
#Multi-select question: every answer option lives in its own column.
#Map each option column to the response text used as the row label.
share_preserve_variables = dict(
    share_preserve_what_av="Audio/visual recordings",
    share_preserve_what_demographic="Demographic data ",
    share_preserve_what_medical="Clinical or Medical data",
    share_preserve_what_scales_quantitative="Quantitative data from questionnaires",
    share_preserve_what_scales_qualitative="Qualitative data from questionnaires",
    share_preserve_what_behavioral="Behavioral data",
    share_preserve_what_qualitative="Qualitative data",
    share_preserve_what_neurphysiological="Data from neuropsychological or aptitude tests ",
    share_preserve_what_neuroimaging="Neuroimaging data",
    share_preserve_what_writing="Data from written documents",
    share_preserve_what_physiological="Physiological data",
    share_preserve_what_genetic="Genetic/molecular data",
    share_preserve_what_eye_tracking="Eye tracking/pupillometry data ",
    share_preserve_what_session="Information about the data collection session ",
    share_preserve_what_paradigm="Task-related information",
    share_preserve_what_stimuli="Task-related stimuli",
    share_preserve_what_code_collection="Computer code used for data collection ",
    share_preserve_what_schemes="Coding materials",
    share_preserve_what_consent="Informed consent-related documentation",
    share_preserve_what_other="Other",
)
#"Number" is the count of non-missing entries per option column; "Percentage"
#divides that count by the sum of the question-level column
#(presumably the number of participants who answered - confirm in the data dictionary).
df_share_preserve = (
    df[list(share_preserve_variables)]
    .count()
    .to_frame("Number")
    .assign(Percentage=lambda t: t["Number"] / df["share_preserve_what"].sum() * 100)
)
#Swap the column names in the index for the readable response text.
df_share_preserve = df_share_preserve.set_index([list(share_preserve_variables.values())])
df_share_preserve
#Tally the free-text answers given alongside "other".
df["share_preserve_what_other_text"].value_counts()
#This question contained multiple parts, participants gave one answer to each.
#Map each item column to the text used as its row label.
sharing_education_variables = {
    "share_education_useful": "Sharing data in a form that ensures it will be useful to others",
    "share_education_platforms": "Use of different platforms, repositories, or tools for sharing data ",
    "share_education_confidentiality": "Protecting participant confidentiality in shared data.",
    "share_education_reuse": "Understanding reuse rights related to data",
    "share_education_archiving": "Best practices for preserving or archiving data over the long term.",
}
#Create dataframe containing descriptive statistics (one row per item).
#They're interesting and informative, but remember responses are ordinal.
df_sharing_education_cont = df[list(sharing_education_variables)].describe().T
df_sharing_education_cont["median"] = df[list(sharing_education_variables)].median()
df_sharing_education_cont.set_index([list(sharing_education_variables.values())], inplace=True)
df_sharing_education_cont
#Create dataframe displaying the percentage of participants who entered each value.
#Series.value_counts replaces the deprecated top-level pd.value_counts.
df_sharing_education_cat = df[list(sharing_education_variables)]
df_sharing_education_cat = df_sharing_education_cat.apply(pd.Series.value_counts, normalize=True) * 100
#Transpose so each item is a row and each response value a column.
df_sharing_education_cat = df_sharing_education_cat.T
df_sharing_education_cat.set_index([list(sharing_education_variables.values())], inplace=True)
df_sharing_education_cat
#Create a stacked bar chart to display the data, drawn with the "Greens" palette.
with sns.color_palette("Greens"):
    bar_education_collect_stacked = df_sharing_education_cat.plot(kind='barh', stacked=True, legend=False)
#Clean up the formatting
bar_education_collect_stacked.invert_yaxis()  #first item at the top
bar_education_collect_stacked.set(xlim=(0, 100))
plt.xlabel("Percentage")
sns.despine(offset=10)
plt.legend(bbox_to_anchor=(1, 1), loc=2)
#Dashed reference lines at 25%, 50% and 75%.
plt.axvline(25, color="k", linestyle="--");
plt.axvline(50, color="k", linestyle="--");
plt.axvline(75, color="k", linestyle="--");
Description: The questions in this section concern activities, practices, and plans related to new (or newly visible) ways of communicating, disseminating, or sharing material related to your research.
#Participants could select multiple responses for this question.
#Map each option column to the response text used as the row label.
sc_publish_variables = {
    "sc_publish_communicate": "To communicate my results and/or add to the scholarly literature",
    "sc_publish_validity": "To allow other researchers to assess the validity of my conclusions.",
    "sc_publish_incentives": "Professional incentives (e.g. authorship or citations are required for promotion)",
    "sc_publish_ip": "To establish intellectual property or patent claims.",
    "sc_publish_funder": "It is expected by my funding agency.",
    "sc_publish_employer": "It is expected by my employer.",
    "sc_publish_other": "Other",
}
#Create dataframe containing both the number and percentage of responding participants who selected each response.
#The denominator must be this question's own column; the previous code divided by
#df["share_use"].sum(), copied from a different question.
#NOTE(review): assumes a question-level "sc_publish" column exists, following the
#pattern of the other multi-select questions (share_use, share_motivation, ...) - confirm in the data dictionary.
df_sc_publish = pd.DataFrame({"Number": df[list(sc_publish_variables)].count(),
                              "Percentage": df[list(sc_publish_variables)].count() / df["sc_publish"].sum() * 100})
#Swap the column names in the index for the readable response text.
df_sc_publish.set_index([list(sc_publish_variables.values())], inplace=True)
df_sc_publish
#Single-response question: one row per answer option, with the raw count
#("Number") and the share of non-missing answers ("Percentage").
df_sc_limited = (
    df["sc_limited"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["sc_limited"].value_counts(normalize=True) * 100)
)
df_sc_limited
#Single-response question: same count/percentage table for "sc_firstclass".
df_sc_firstclass = (
    df["sc_firstclass"]
    .value_counts()
    .to_frame("Number")
    .assign(Percentage=df["sc_firstclass"].value_counts(normalize=True) * 100)
)
df_sc_firstclass
#This question contained multiple parts, participants gave one answer to each.
#Map each activity column to the text used as its row label.
#The labels for "..._data_paper" and "..._cite_code" were swapped in the previous
#version; they now match their column names.
sc_activities_current = {
    "sc_activities_current_preprint": "Publish a preprint",
    "sc_activities_current_oa_gold": "Publish in an open access journal",
    "sc_activities_current_oa_green": "Deposit an author’s accepted manuscript",
    "sc_activities_current_cite_data": "Cite a dataset",
    "sc_activities_current_data_paper": "Publish a data paper or publish a dataset",
    "sc_activities_current_cite_code": "Cite code or software",
    "sc_activities_current_data_mediated": "Make data available, but only to researchers with appropriate credentials",
    "sc_activities_current_share_materials": "Share or publish other research materials",
    "sc_activities_current_share_protocol": "Share or publish a study protocol",
    "sc_activities_current_preregister": "Pre-register a study",
    "sc_activities_current_register_report": "Submitting a registered report",
    "sc_activities_current_curation": "Take advantage of a data curation or research data management service",
    "sc_activities_current_replication": "Publish a direct replication of a previously published study",
}
#Create dataframe containing the percentage of responding participants who gave each answer.
#Series.value_counts replaces the deprecated top-level pd.value_counts.
df2 = df[list(sc_activities_current)]
df2 = df2.apply(pd.Series.value_counts, normalize=True) * 100
#Transpose so each activity is a row and each answer a column.
df2 = df2.T
df2.set_index([list(sc_activities_current.values())], inplace=True)
df2
#This question contained multiple parts, participants gave one answer to each.
#Map each activity column to the text used as its row label.
#The labels for "..._data_paper" and "..._cite_code" were swapped in the previous
#version; they now match their column names.
sc_activities_future = {
    "sc_activities_future_preprint": "Publish a preprint",
    "sc_activities_future_oa_gold": "Publish in an open access journal",
    "sc_activities_future_oa_green": "Deposit an author’s accepted manuscript",
    "sc_activities_future_cite_data": "Cite a dataset",
    "sc_activities_future_data_paper": "Publish a data paper or publish a dataset",
    "sc_activities_future_cite_code": "Cite code or software",
    "sc_activities_future_data_mediated": "Make data available, but only to researchers with appropriate credentials",
    "sc_activities_future_share_materials": "Share or publish other research materials",
    "sc_activities_future_share_protocol": "Share or publish a study protocol",
    "sc_activities_future_preregister": "Pre-register a study",
    "sc_activities_future_register_report": "Submitting a registered report",
    "sc_activities_future_curation": "Take advantage of a data curation or research data management service",
    "sc_activities_future_replication": "Publish a direct replication of a previously published study",
}
#Create dataframe containing the percentage of responding participants who gave each answer.
#Series.value_counts replaces the deprecated top-level pd.value_counts.
df3 = df[list(sc_activities_future)]
df3 = df3.apply(pd.Series.value_counts, normalize=True) * 100
#Transpose so each activity is a row and each answer a column.
df3 = df3.T
df3.set_index([list(sc_activities_future.values())], inplace=True)
df3
#This question contained multiple parts, participants gave one answer to each.
#Map each item column to the text used as its row label.
sc_education_variables = {
    "sc_education_oa": "Open access publishing",
    "sc_education_preregister": "Preregistering studies",
    "sc_education_tools": "Using open science software tools",
    "sc_education_practices": "Applying open science practices",
    "sc_education_datasets": "Finding and using openly available datasets",
}
#Create dataframe containing descriptive statistics (one row per item).
#They're interesting and informative, but remember responses are ordinal.
#Stored under its own name so the data-sharing table (df_sharing_education_cont)
#is no longer overwritten by this question's results.
df_sc_education_cont = df[list(sc_education_variables)].describe().T
df_sc_education_cont["median"] = df[list(sc_education_variables)].median()
df_sc_education_cont.set_index([list(sc_education_variables.values())], inplace=True)
df_sc_education_cont
#Create dataframe displaying the percentage of participants who entered each value.
#Series.value_counts replaces the deprecated top-level pd.value_counts.
df_sc_education_cat = df[list(sc_education_variables)]
df_sc_education_cat = df_sc_education_cat.apply(pd.Series.value_counts, normalize=True) * 100
#Transpose so each item is a row and each response value a column.
df_sc_education_cat = df_sc_education_cat.T
df_sc_education_cat.set_index([list(sc_education_variables.values())], inplace=True)
df_sc_education_cat
#Create a stacked bar chart to display the data.
#Plot this question's table; the previous code charted df_sharing_education_cat
#(the data-sharing question) a second time.
with sns.color_palette("Greens"):
    sc_stacked = df_sc_education_cat.plot(kind='barh', stacked=True, legend=False)
#Clean up the formatting
sc_stacked.invert_yaxis()  #first item at the top
sc_stacked.set(xlim=(0, 100))
plt.xlabel("Percentage")
sns.despine(offset=10)
plt.legend(bbox_to_anchor=(1, 1), loc=2)
#Dashed reference lines at 25%, 50% and 75%.
plt.axvline(25, color="k", linestyle="--");
plt.axvline(50, color="k", linestyle="--");
plt.axvline(75, color="k", linestyle="--");
#Nonparametric comparisons of the rating columns.
#Missing responses are dropped before every test, matching the Kruskal-Wallis calls
#below. How mannwhitneyu treats NaN depends on the SciPy version (recent versions
#default to nan_policy="propagate"), so unfiltered columns can give NaN or
#distorted results.
#Mann-Whitney U: "self" vs. "field" column for each of collect / analyze / share.
stats.mannwhitneyu(df['collect_mature_self'].dropna(), df['collect_mature_field'].dropna())
stats.mannwhitneyu(df['analyze_mature_self'].dropna(), df['analyze_mature_field'].dropna())
stats.mannwhitneyu(df['share_mature_self'].dropna(), df['share_mature_field'].dropna())
#Kruskal-Wallis: compare the collect / analyze / share columns with each other.
stats.kruskal(df['collect_mature_self'].dropna(),df['analyze_mature_self'].dropna(),df['share_mature_self'].dropna())
stats.kruskal(df['collect_mature_field'].dropna(),df['analyze_mature_field'].dropna(),df['share_mature_field'].dropna())
stats.kruskal(df['collect_change'].dropna(),df['analyze_change'].dropna(),df['share_change'].dropna())
#Mann-Whitney U: "share_receive_usable" vs. "share_request_usable".
stats.mannwhitneyu(df["share_receive_usable"].dropna(), df["share_request_usable"].dropna())
#This question contained multiple parts, participants gave one answer to each.
#Map each item column to the text used as its row label.
limits_variables = {
    "limits_time": "The amount of time it takes",
    "limits_incentives": "Lack of professional incentives",
    "limits_training": "Lack of training",
    "limits_norms": "Lack of norms or best practices",
    "limits_support": "Lack of institutional support",
    "limits_knowldge": "I am unaware of best practices",
    "limits_guidance": "Lack of guidance from my PI/collaborators",
    "limits_data": "The characteristics of my data limit what I can do",
    "limits_cost": "The financial cost",
    "limits_pi": "Requirements of PI/collaborators",
}
#Create dataframe displaying the percentage of participants who entered each value.
#Series.value_counts replaces the deprecated top-level pd.value_counts.
df_limits_cat = df[list(limits_variables)]
df_limits_cat = df_limits_cat.apply(pd.Series.value_counts, normalize=True) * 100
#Transpose so each item is a row and each response value a column.
df_limits_cat = df_limits_cat.T
df_limits_cat.set_index([list(limits_variables.values())], inplace=True)
#Create a stacked bar chart to display the data (one explicit color per response value).
bar_limits_stacked = df_limits_cat.plot(kind='barh', stacked=True, legend=False, color=["#dad7cb", '#b6b1a9', "#8f1425","#928b81","#5f574f"])
#Clean up the formatting
bar_limits_stacked.invert_yaxis()  #first item at the top
bar_limits_stacked.set(xlim=(0, 100))
plt.xlabel("Percentage")
bar_limits_stacked.get_xaxis().set_ticks([0,25,50,75,100])
sns.despine(offset=10)
plt.legend(bbox_to_anchor=(1, 1), loc=2)
#Dashed reference lines at 25%, 50% and 75%.
plt.axvline(25, color="k", linestyle="--");
plt.axvline(50, color="k", linestyle="--");
plt.axvline(75, color="k", linestyle="--");
#Write the figure to disk; the path is relative to the current working directory.
plt.savefig("Desktop/psych_limits.png", format='png', dpi=1000, bbox_inches="tight")
#This question contained multiple parts, participants gave one answer to each.
#Map each item column to the text used as its row label.
motivations_variables = {
    "motivations_loss": "Prevent loss of data",
    "motivations_reproducibility": "Desire to foster reproducibility",
    "motivations_transparency": "Desire to foster research transparency",
    "motivations_continuity": "Ensure continuity as research team changes",
    "motivations_compliance_ethics": "Compliance with legal/ethical frameworks",
    "motivations_best_practice": "Awareness of best practices",
    "motivations_compliance_funding": "Compliance with mandates from funder/publisher",
    "motivations_guidance": "Availability of guidance of best practices",
    "motivations_pi": "Guidance from PI/Collaborators",
    "motivations_training": "Availability of training",
    "motivations_support": "Institutional support",
}
#Create dataframe displaying the percentage of participants who entered each value.
#Series.value_counts replaces the deprecated top-level pd.value_counts.
df_motivations_cat = df[list(motivations_variables)]
df_motivations_cat = df_motivations_cat.apply(pd.Series.value_counts, normalize=True) * 100
#Transpose so each item is a row and each response value a column.
df_motivations_cat = df_motivations_cat.T
df_motivations_cat.set_index([list(motivations_variables.values())], inplace=True)
#Create a stacked bar chart to display the data (one explicit color per response value).
bar_motivations_stacked = df_motivations_cat.plot(kind='barh', stacked=True, legend=False, color=["#dad7cb", '#b6b1a9', "#14628f","#928b81","#5f574f"])
#Clean up the formatting
bar_motivations_stacked.invert_yaxis()  #first item at the top
bar_motivations_stacked.set(xlim=(0, 100))
plt.xlabel("Percentage")
#The tick positions were previously set twice; once is enough.
bar_motivations_stacked.get_xaxis().set_ticks([0,25,50,75,100])
sns.despine(offset=10)
plt.legend(bbox_to_anchor=(1, 1), loc=2)
#Dashed reference lines at 25%, 50% and 75%.
plt.axvline(25, color="k", linestyle="--");
plt.axvline(50, color="k", linestyle="--");
plt.axvline(75, color="k", linestyle="--");
#Write the figure to disk; the path is relative to the current working directory.
plt.savefig("Desktop/psych_motivations.png", format='png', dpi=1000, bbox_inches="tight")
#This question contained multiple parts, participants gave one answer to each.
#Redefine the activity-to-label mapping in the row order wanted for the comparison figure.
sc_activities_current = {
    "sc_activities_current_data_paper": "Publish a data paper or publish a dataset",
    "sc_activities_current_register_report": "Submitting a registered report",
    "sc_activities_current_curation": "Take advantage of a data curation or research data management service",
    "sc_activities_current_replication": "Publish a direct replication of a previously published study",
    "sc_activities_current_data_mediated": "Make data available, but only to researchers with appropriate credentials",
    "sc_activities_current_share_protocol": "Share or publish a study protocol",
    "sc_activities_current_cite_data": "Cite a dataset",
    "sc_activities_current_preprint": "Publish a preprint",
    "sc_activities_current_share_materials": "Share or publish other research materials",
    "sc_activities_current_preregister": "Pre-register a study",
    "sc_activities_current_oa_green": "Deposit an author’s accepted manuscript",
    "sc_activities_current_cite_code": "Cite code or software",
    "sc_activities_current_oa_gold": "Publish in an open access journal",
}
#Create dataframe containing the percentage of responding participants who gave each answer.
#Series.value_counts replaces the deprecated top-level pd.value_counts.
df2 = df[list(sc_activities_current)]
df2 = df2.apply(pd.Series.value_counts, normalize=True) * 100
#Transpose so each activity is a row and each answer a column.
df2 = df2.T
df2.set_index([list(sc_activities_current.values())], inplace=True)
df2
#This question contained multiple parts, participants gave one answer to each.
#Redefine the activity-to-label mapping in the row order wanted for the comparison figure.
sc_activities_future = {
    "sc_activities_future_data_paper": "Publish a data paper or publish a dataset",
    "sc_activities_future_register_report": "Submitting a registered report",
    "sc_activities_future_curation": "Take advantage of a data curation or research data management service",
    "sc_activities_future_replication": "Publish a direct replication of a previously published study",
    "sc_activities_future_data_mediated": "Make data available, but only to researchers with appropriate credentials",
    "sc_activities_future_share_protocol": "Share or publish a study protocol",
    "sc_activities_future_cite_data": "Cite a dataset",
    "sc_activities_future_preprint": "Publish a preprint",
    "sc_activities_future_share_materials": "Share or publish other research materials",
    "sc_activities_future_preregister": "Pre-register a study",
    "sc_activities_future_oa_green": "Deposit an author’s accepted manuscript",
    "sc_activities_future_cite_code": "Cite code or software",
    "sc_activities_future_oa_gold": "Publish in an open access journal",
}
#Create dataframe containing the percentage of responding participants who gave each answer.
#Series.value_counts replaces the deprecated top-level pd.value_counts.
df3 = df[list(sc_activities_future)]
df3 = df3.apply(pd.Series.value_counts, normalize=True) * 100
#Transpose so each activity is a row and each answer a column.
df3 = df3.T
df3.set_index([list(sc_activities_future.values())], inplace=True)
df3
#Put the "Yes" percentages from the current (df2) and future (df3) tables side by side.
df4 = pd.DataFrame(
    {
        "Current": df2["Yes"],
        "Future": df3["Yes"],
    }
)
#Grouped horizontal bars, drawn with the "tab10" palette.
with sns.color_palette("tab10"):
    a_stacked = df4.plot(kind='barh', legend=False)
#Tidy the axes: first activity at the top, x axis fixed to 0-100.
a_stacked.invert_yaxis()
a_stacked.set_xlim(0, 100)
a_stacked.set_xlabel("Percentage")
sns.despine(offset=10)
#Legend sits outside the plot, anchored at its upper-right corner.
a_stacked.legend(bbox_to_anchor=(1, 1), loc=2)
#Write the figure to disk; the path is relative to the current working directory.
plt.savefig("Desktop/psych_future.png", format='png', dpi=1000, bbox_inches="tight")