From 67523501e0c5487c45259be554cf9b80aa8fe47d Mon Sep 17 00:00:00 2001 From: Laurent Modolo <laurent@modolo.fr> Date: Tue, 17 Mar 2020 17:07:18 +0100 Subject: [PATCH] cleanup presentation.tex draft --- presentation.tex | 536 ----------------------------------------------- 1 file changed, 536 deletions(-) delete mode 100644 presentation.tex diff --git a/presentation.tex b/presentation.tex deleted file mode 100644 index 5ecea04..0000000 --- a/presentation.tex +++ /dev/null @@ -1,536 +0,0 @@ -\documentclass{beamer} - -\begin{document} - -\begin{frame} - \frametitle{Good Enough Practices in Scientific Computing} - \framesubtitle{Data Management} - \begin{itemize} - \item Save the raw data - \only<1>{ - \begin{itemize} - \item re-running analysis from start to finish - \item recovery from mishaps - \item experimenting without fear - \end{itemize} - } - - \item Create data you wish to see in the world - \only<2>{ - \begin{itemize} - \item improve machine and human readability - \item open, non-proprietary formats ensure machine readability across - time and computing setups (CSV, JSON, \dots) - \item use self-explaining variable names (\texttt{untreated} vs \texttt{treated}) - \item use \texttt{NA} for missing data - \item useful metadata can be used as part of the filename (date, location, \dots). 
- \end{itemize} - } - - \item Create analysis-friendly data - \only<3>{ - \begin{itemize} - \item Make each column a variable - \item Make each row an observation - \end{itemize} - } - - \item Record all the steps used to process data - \only<4>{ - Otherwise it's impossible for you or anyone else to repeat the analysis - \begin{itemize} - \item Write scripts for every stage of data processing - \item When scripting is not feasible: document every manual action - \end{itemize} - } - - \item Anticipate the need to use multiple tables - \only<4>{ - Use a subject id represented in a common format across the tables - } - - \item Submit data to a reputable DOI-issuing repository so that others can - access and cite it. - \only<4>{ - Your data is as much a product of your research as the papers you write. - } - \end{itemize} -\end{frame} - -\begin{frame} - \frametitle{Good Enough Practices in Scientific Computing} - \framesubtitle{Software} - \begin{itemize} - \item Place a brief explanatory comment at the start of every program - \only<1>{ - It should include at least one example of how the program is used - } - - \item Decompose programs into functions - \only<2>{ - We are limited, break things into chunks that we can understand - \begin{itemize} - \item no more than 60 lines - \item do not use global variables - \item less than 7 parameters - \end{itemize} - } - - \item Eliminate duplication - \only<3>{ - \begin{itemize} - \item Write and re-use functions instead of copying code - \item Use data structures instead of lots of variables - \item Use existing and well-maintained software libraries - \item and test them! 
- \end{itemize} - } - - \item Give functions and variables meaningful names - \only<4>{ - \begin{itemize} - \item The greater its scope, the more informative its name should be - \item With tab completion, you have no excuses - \end{itemize} - } - - \item Make dependencies and requirements explicit - \only<4>{ - This is the ``Getting started'' section of the README file - } - - \item Do not comment / uncomment sections of code to control its behavior - \only<4>{ - It's error prone and makes it impossible to automate analyses - } - - \item Provide a simple example or test data set - \only<4>{ - \begin{itemize} - \item it's easier for others to test it - \item it's easier for you to validate changes - \item you can test it in different environments - \end{itemize} - } - - \item Submit code to a reputable DOI-issuing repository - \only<4>{ - Figshare, Zenodo, \dots - } - \end{itemize} -\end{frame} - -\begin{frame} - \frametitle{Good Enough Practices in Scientific Computing} - \framesubtitle{Collaboration} - Make it easy for new collaborators to join your project - \begin{itemize} - \item Create an overview of your project - \only<1>{ - \begin{itemize} - \item have a README at the root of your project - \item project title, description, contact information - \item examples of how to run tasks - \item have a CONTRIBUTING file - \item point visitors to ways they can help - \item dependencies that need to be installed - \item tests that can be run - \item guidelines that the project adheres to - \end{itemize} - } - - \item Create a shared public \emph{to-do} list - \only<2>{ - \begin{itemize} - \item have a todo.txt or \emph{issue} in GitLab - \item describe the items clearly so they make sense to newcomers - \end{itemize} - } - - \item Make the license explicit - \only<3>{ - \begin{itemize} - \item have a LICENSE file - \item lack of an explicit license implies that the author is keeping all rights and others are not allowed to re-use or modify the material - \end{itemize} - } - 
- \item Make the project citable - \only<4>{ - \begin{itemize} - \item have a CITATION file - \item how to cite the project as a whole - \item where to find and how to cite any data sets, code, figures, \dots - \end{itemize} - } - \end{itemize} -\end{frame} - -\begin{frame} - \frametitle{Good Enough Practices in Scientific Computing} - \framesubtitle{Project Organization} - \begin{itemize} - \item Put each project in its own directory named after the project - \only<1>{ - \begin{itemize} - \item helps you and others to best understand your work - \item divide projects based on the overlap in data and code files - \end{itemize} - } - - \item Put text documents associated with the project in the \texttt{doc} - directory - \only<2>{ - \begin{itemize} - \item files for manuscripts - \item documentation for source code - \item electronic lab notebook recording your experiments - \end{itemize} - } - - \item Put raw data and metadata in the \texttt{data} directory - \only<3>{ - \begin{itemize} - \item put examples in the \texttt{examples} subdirectory - \item name files with a timestamp - \end{itemize} - } - - \item Put project source code in the \texttt{src} directory - \only<4>{ - \begin{itemize} - \item code written in interpreted languages such as R or Python - \item code written in compiled languages like C++ or Java - \item shell scripts - \item snippets of SQL - \item other code needed to regenerate the results - \end{itemize} - } - \only<5>{ - This directory may contain two conceptually distinct types of files - \begin{itemize} - \item files that perform the core analysis - \item controller or driver scripts that combine the core analytical - functions with particular parameters in order to execute the entire - project analysis from start to finish - \end{itemize} - } - - \item Put files generated during cleanup and analysis in the \texttt{results} - directory - \only<6>{ - \begin{itemize} - \item cleaned data - \item simulated data - \item final results (figures and 
tables) - \item files belonging to different papers should be grouped together - \end{itemize} - This directory can be filled with the content of the \texttt{src} directory - } - - \item Put external scripts or compiled programs in the \texttt{bin} directory - \only<7>{ - this directory can be rebuilt from the README and CONTRIBUTING files - } - - \item Name all files to reflect their content or function - \only<8>{ - } - \end{itemize} -\end{frame} - -\begin{frame} - \frametitle{Good Enough Practices in Scientific Computing} - \framesubtitle{Keeping Track of Changes} - Use a version control system like Git - \begin{itemize} - \item Back up (almost) everything created by a human being - \only<1>{ - \begin{itemize} - \item do it as soon as it is created - \item everything in the folders \texttt{src} and \texttt{doc} - \end{itemize} - } - - \item Keep changes small - \only<2>{ - \begin{itemize} - \item not so big that the differences between components cannot be investigated separately - \item not too small for the same reason - \item single changes should be possible to undo in one step at some point in the future - \end{itemize} - } - - \item Share changes frequently - \only<3>{ - \begin{itemize} - \item everyone working on the project should share and incorporate changes on a regular basis. 
- \item don't allow different versions of the project to drift apart - \end{itemize} - } - - \item Create, maintain, and use a checklist for saving and sharing changes to the project - \only<4>{ - \begin{itemize} - \item writing log messages that clearly explain any changes - \item style guidelines for code - \item updating the to-do list - \item bans on committing half-done work or broken code - \end{itemize} - } - \only<5>{ - This directory may contain two conceptually distinct types of files - \begin{itemize} - \item files that perform the core analysis - \item controller or driver scripts that combine the core analytical - functions with particular parameters in order to execute the entire - project analysis from start to finish - \end{itemize} - } - - \item Put files generated during cleanup and analysis in the \texttt{results} - directory - \only<6>{ - \begin{itemize} - \item cleaned data - \item simulated data - \item final results (figures and tables) - \item files belonging to different papers should be grouped together - \end{itemize} - This directory can be filled with the content of the \texttt{src} directory - } - - \item Put external scripts or compiled programs in the \texttt{bin} directory - \only<7>{ - this directory can be rebuilt from the README and CONTRIBUTING files - } - - \item Name all files to reflect their content or function - \only<8>{ - } - \end{itemize} -\end{frame} - -\begin{frame} - \frametitle{Semantic Versioning 2.0.0 (http://semver.org/)} - Given a version number \textbf{major.minor.patch}, increment the: - \begin{itemize} - \item \textbf{major} version when you make incompatible API changes, - \item \textbf{minor} version when you add functionality in a backwards-compatible manner, and - \item \textbf{patch} version when you make backwards-compatible bug fixes. 
- \end{itemize} -\end{frame} - -\begin{frame} - \frametitle{10 Simple Rules for the Open Development of Scientific Software} - \only<1>{ - \begin{block}{Rule 1: Don’t Reinvent the Wheel} - Many fundamental scientific algorithms and methods have already been - implemented in open-source libraries.\\ - If it is useful it can benefit everyone, even if it addresses a mundane - task.\\ - When there are no existing implementations for your platform, or they - cannot cope with the size, complexity, or other specifics of your data, - then new approaches may be required that lead to new science. - \end{block} - } - \only<2>{ - \begin{block}{Rule 2: Code Well} - Learn the basics of software development.\\ - Study other people’s code and learn by practice.\\ - Join an existing open-source project. - \end{block} - } - \only<3>{ - \begin{block}{Rule 3: Be Your Own User} - ``eat your own dog food''\\ - Your software should be useful to other developers, not simply a - demonstration of the solution.\\ - Issues with how software is structured or functions in a variety of situations are - difficult to detect during peer review. - \end{block} - } - \only<4>{ - \begin{block}{Rule 4: Be Transparent} - People with similar or related research interests who discover the project - will find that they have more to gain from collaborating than from - competing with the original developers.\\ - Allows many eyes to evaluate the code and recognize and fix any issues, - which reduces the likelihood of serious errors in the final product. - \end{block} - } - \only<5>{ - \begin{block}{Rule 5: Be Simple} - Science is hard enough already.\\ - Documentation helps a lot, but simplicity is key.\\ - Employ standard package or software installation models.\\ - Support standard file formats and don’t come up with new, custom formats. 
- \end{block} - } - \only<6>{ - \begin{block}{Rule 6: Don’t Be a Perfectionist} - ``Release early, release often''\\ - ``customers'' will quickly identify problems and new requirements, and you - will be able to fix them more quickly if you avoid sitting on and - polishing new code for several months. - \end{block} - } - \only<7>{ - \begin{block}{Rule 7: Nurture and Grow Your Community} - Make it easy for others to contribute ideas and act on feedback.\\ - Avoid changing key aspects of your code that other people’s software or - analysis pipelines might depend on, such as file formats, command line - arguments, or API.\\ - \url{http://semver.org} - \end{block} - } - \only<8>{ - \begin{block}{Rule 8: Promote Your Project} - A clean, well-organized website.\\ - Come up with a name and logo.\\ - Create personae for your project on social networks.\\ - Go to conferences and give as many presentations as you can. - \end{block} - } - \only<9>{ - \begin{block}{Rule 9: Find Sponsors} - Some level of funding is essential.\\ - It can be easier if the previous rules are followed. - \end{block} - } - \only<10>{ - \begin{block}{Rule 10: Science Counts} - Maintenance of code that is no longer relevant to your own research is a - serious time sink.\\ - Open-source communities ensure persistence of projects by allowing project - leadership to be shared and passed to other members. - \end{block} - } -\end{frame} - -\begin{frame} - \frametitle{10 Simple Rules for Reproducible Computational Research} - \framesubtitle{good habits of reproducibility may actually turn out to be - a time-saver in the longer run.} - - \only<1>{ - \begin{block}{Rule 1: For Every Result, Keep Track of How It Was Produced} - Record every step necessary to produce the results.\\ - Name, version and parameters of the programs. - \end{block} - } - \only<2>{ - \begin{block}{Rule 2: Avoid Manual Data Manipulation Steps} - As a minimum, note down which data files were modified or moved, - and for what purpose. 
- \end{block} - } - \only<3>{ - \begin{block}{Rule 3: Archive the Exact Versions of All External Programs - Used} - The only viable solution may then be to store a full virtual - machine image of the OS.\\ - As a minimum, note the exact names and versions of the programs. - \end{block} - } - \only<4>{ - \begin{block}{Rule 4: Version Control All Custom Scripts} - If computer code is not systematically archived along its evolution, - backtracking to a code state that gave a certain result may be a hopeless - task.\\ - Use a version control system. - \end{block} - } - \only<5>{ - \begin{block}{Rule 5: Record All Intermediate Results, When Possible in - Standardized Formats} - Reveals discrepancies from what is assumed that are not apparent in the - final results.\\ - Reveals consequences of alternative programs and parameter choices at - individual steps.\\ - Allows parts of the process to be rerun.\\ - Experienced inconsistencies can be tracked to the steps where the - problems arise. - \end{block} - } - \only<6>{ - \begin{block}{Rule 6: For Analyses That Include Randomness, Note Underlying - Random Seeds} - There is a large difference between observing that a result has been reproduced exactly or only approximately.\\ - This allows results to be reproduced exactly. - \end{block} - } - \only<7>{ - \begin{block}{Rule 7: Always Store Raw Data behind Plots} - Store the code used to make the plot.\\ - One can simply modify the plotting procedure, instead of having to redo - the whole analysis.\\ - If one really wants to read fine values in a figure, one can consult the - raw numbers. 
- \end{block} - } - \only<8>{ - \begin{block}{Rule 8: Generate Hierarchical Analysis Output, Allowing Layers - of Increasing Detail to Be Inspected} - When the storage context allows, it is better to simply incorporate - permanent output of all underlying data when a main result is generated, - using a systematic naming convention to allow the full data underlying a - given summarized value to be easily found. - \end{block} - } - \only<9>{ - \begin{block}{Rule 9: Connect Textual Statements to Underlying Results} - If you want to reevaluate your previous interpretations, or allow peers - to make their own assessment of claims you make in a scientific paper, you - will have to connect a given textual statement (interpretation, claim, - conclusion) to the precise results underlying the statement.\\ - Such a connection can for instance be a simple file path to detailed - results, or the ID of a result in an analysis framework, included within - the text itself. - \end{block} - } - \only<10>{ - \begin{block}{Rule 10: Provide Public Access to Scripts, Runs, and Results} - All input data, scripts, versions, parameters, and intermediate results - should be made publicly and easily accessible.\\ - Making reproducibility of your work by peers a realistic possibility sends - a strong signal of quality, trustworthiness, and transparency. This could - increase the quality and speed of the reviewing process on your work, the - chances of your work getting published, and the chances of your work being - taken further and cited by other researchers after publication. - \end{block} - } - -\end{frame} - -\begin{frame} - \frametitle{10 Simple Rules to Enable Multi-Site Collaboration through Data - Sharing} - \begin{enumerate} - \item Rule 1: Make Software Open-Source - collaborators must have access to code in a shared repository. 
- \item Rule 2: Provide Open-Source Data - in addition to raw data files, it is also helpful to provide intermediate - files at various stages of processing. - \item Rule 3: Use Multiple Platforms to Share Research Products - this will increase the number of users that will find your data and results - interesting. - \item Rule 4: Secure Necessary Permissions/Data Use Agreements A Priori - before publishing, or providing data in any type of platform it is important - to secure all necessary provisions and data use agreements. - \item Rule 5: Know the Privacy Rules for Your Data - anonymize patient information while allowing patient-level data sharing. - \item Rule 6: Facilitate Reproducibility - see the 10 simple rules. - \item Rule 7: Think Global - harmonization of data, accessible documentation. - \item Rule 8: Publicize Your Work - publish data and analysis / algorithm in different journals. - \item Rule 9: Stay Realistic, but Aim High - realistic, but do not be afraid to challenge the status quo. - \item Rule 10: Be Engaged - release early, release often. Communicate often. Care. - \end{enumerate} -\end{frame} - -\end{document} -- GitLab