cleanup presentation.tex draft

67523501 · Laurent Modolo · b735469e · b735469e
Unverified Commit 67523501 authored 5 years ago by Laurent Modolo
--- a/presentation.tex
+++ b/presentation.tex
-\documentclass{beamer}
-
-\begin{document}
-
-\begin{frame}
-  \frametitle{Good Enough Practices in Scientific Computing}
-  \framesubtitle{Data Management}
-  \begin{itemize}
-    \item Save the raw data
-    \only<1>{
-      \begin{itemize}
-        \item re-running analysis from start to finish
-        \item recovery from mishaps
-        \item experimenting without fear
-      \end{itemize}
-    }
-
-    \item Create data you wish to see in the world
-    \only<2>{
-      \begin{itemize}
-        \item improve machine and human readability
-        \item open, non-proprietary formats ensure machine readability across
-        time and computing setups (CSV, JSON, \dots)
-        \item use self-explaining variable names (\texttt{untreated} vs \texttt{treated})
-        \item use \texttt{NA} for missing data
-        \item useful metadata can be used as part of the filename (date, location, \dots).
-      \end{itemize}
-    }
-
-    \item Create analysis-friendly data
-    \only<3>{
-      \begin{itemize}
-        \item Make each column a variable
-        \item Make each row an observation
-      \end{itemize}
-    }
-
-    \item Record all the steps used to process data
-    \only<4>{
-      Otherwise it's impossible for you or anyone else to repeat the analysis
-      \begin{itemize}
-        \item Write scripts for every stage of data processing
-        \item When scripting is not feasible: document every manual action
-      \end{itemize}
-    }
-
-    \item Anticipate the need to use multiple tables
-    \only<4>{
-      Use a subject id represented in a common format across the tables
-    }
-
-    \item Submit data to a reputable DOI-issuing repository so that others can
-    access and cite it.
-    \only<4>{
-      Your data is as much a product of your research as the papers you write.
-    }
-  \end{itemize}
-\end{frame}
-
-\begin{frame}
-  \frametitle{Good Enough Practices in Scientific Computing}
-  \framesubtitle{Software}
-  \begin{itemize}
-    \item Place a brief explanatory comment at the start of every program
-    \only<1>{
-      It should include at least one example of how the program is used
-    }
-
-    \item Decompose programs into functions
-    \only<2>{
-      We are limited, break things into chunks that we can understand
-      \begin{itemize}
-        \item no more than 60 lines
-        \item do not use global variables
-        \item less than 7 parameters
-      \end{itemize}
-    }
-
-    \item Eliminate duplication
-    \only<3>{
-      \begin{itemize}
-        \item Write and re-use functions instead of copying code
-        \item Use data structures instead of lots of variables
-        \item Use existing and well-maintained software libraries
-        \item and test them!
-      \end{itemize}
-    }
-
-    \item Give functions and variables meaningful names
-    \only<4>{
-      \begin{itemize}
-        \item The greater its scope, the more informative its name should be
-        \item With tab completion, you have no excuses
-      \end{itemize}
-    }
-
-    \item Make dependencies and requirements explicit
-    \only<4>{
-      This is the ``Getting started'' section of the README file
-    }
-
-    \item Do not comment / uncomment sections of code to control its behavior
-    \only<4>{
-      It's error-prone and makes it impossible to automate analyses
-    }
-
-    \item Provide a simple example or test data set
-    \only<4>{
-      \begin{itemize}
-        \item it's easier for others to test it
-        \item it's easier for you to validate changes
-        \item you can test it in different environments
-      \end{itemize}
-    }
-
-    \item Submit code to a reputable DOI-issuing repository
-    \only<4>{
-      Figshare, Zenodo, \dots
-    }
-  \end{itemize}
-\end{frame}
-
-\begin{frame}
-  \frametitle{Good Enough Practices in Scientific Computing}
-  \framesubtitle{Collaboration}
-  Make it easy for new collaborators to join your project
-  \begin{itemize}
-    \item Create an overview of your project
-    \only<1>{
-      \begin{itemize}
-        \item have a README at the root of your project
-        \item project title, description, contact information
-        \item examples of how to run tasks
-        \item have a CONTRIBUTING file
-        \item point visitors to ways they can help
-        \item dependencies that need to be installed
-        \item tests that can be run
-        \item guidelines that the project adheres to
-      \end{itemize}
-    }
-
-    \item Create a shared public \emph{to-do} list
-    \only<2>{
-      \begin{itemize}
-        \item have a todo.txt or \emph{issue} in GitLab
-        \item describe the items clearly so they make sense to newcomers
-      \end{itemize}
-    }
-
-    \item Make the license explicit
-    \only<3>{
-      \begin{itemize}
-        \item have a LICENSE file
-        \item lack of an explicit license implies that the author is keeping all rights and others are not allowed to re-use or modify the material
-      \end{itemize}
-    }
-
-    \item Make the project citable
-    \only<4>{
-      \begin{itemize}
-        \item have a CITATION file
-        \item how to cite the project as a whole
-        \item where to find and how to cite any data sets, code, figures, \dots
-      \end{itemize}
-    }
-  \end{itemize}
-\end{frame}
-
-\begin{frame}
-  \frametitle{Good Enough Practices in Scientific Computing}
-  \framesubtitle{Project Organization}
-  \begin{itemize}
-    \item Put each project in its own directory named after the project
-    \only<1>{
-      \begin{itemize}
-        \item help you and others to best understand your work
-        \item divide projects based on the overlap in data and code files
-      \end{itemize}
-    }
-
-    \item Put text documents associated with the project in the \texttt{doc}
-    directory
-    \only<2>{
-      \begin{itemize}
-        \item files for manuscripts
-        \item documentation for source code
-        \item electronic lab notebook recording your experiments
-      \end{itemize}
-    }
-
-    \item Put raw data and metadata in the \texttt{data} directory
-    \only<3>{
-      \begin{itemize}
-        \item put examples in the \texttt{examples} subdirectory
-        \item name files with a timestamp
-      \end{itemize}
-    }
-
-    \item Put project source code in the \texttt{src} directory
-    \only<4>{
-      \begin{itemize}
-        \item code written in interpreted languages such as R or Python
-        \item code written in compiled languages like C++ or Java
-        \item shell scripts
-        \item snippets of SQL
-        \item other code needed to regenerate the results
-      \end{itemize}
-    }
-    \only<5>{
-      This directory may contain two conceptually distinct types of files
-      \begin{itemize}
-        \item files that perform the core analysis
-        \item controller or driver scripts that combine the core analytical
-        functions with particular parameters in order to execute the entire
-        project analysis from start to finish
-      \end{itemize}
-    }
-
-    \item Put files generated during cleanup and analysis in the \texttt{results}
-    directory
-    \only<6>{
-      \begin{itemize}
-        \item cleaned data
-        \item simulated data
-        \item final results (figures and tables)
-        \item files belonging to different papers should be grouped together
-      \end{itemize}
-      This directory can be filled with the content of the \texttt{src} directory
-    }
-
-    \item Put external scripts or compiled programs in the \texttt{bin} directory
-    \only<7>{
-      this directory can be rebuilt from the README and CONTRIBUTING files
-    }
-
-    \item Name all files to reflect their content or function
-    \only<8>{
-    }
-  \end{itemize}
-\end{frame}
-
-\begin{frame}
-  \frametitle{Good Enough Practices in Scientific Computing}
-  \framesubtitle{Keeping Track of Changes}
-  Use a version control system like Git
-  \begin{itemize}
-    \item Back up (almost) everything created by a human being
-    \only<1>{
-      \begin{itemize}
-        \item do it as soon as it is created
-        \item everything in the folders \texttt{src} and \texttt{documentation}
-      \end{itemize}
-    }
-
-    \item Keep changes small
-    \only<2>{
-      \begin{itemize}
-        \item not so big that the differences between components cannot be investigated separately
-        \item not too short for the same reason
-        \item single changes should be possible to undo in one step at some point in the future
-      \end{itemize}
-    }
-
-    \item Share changes frequently
-    \only<3>{
-      \begin{itemize}
-        \item everyone working on the project should share and incorporate changes on a regular basis.
-        \item don't allow different versions of the project to drift apart
-      \end{itemize}
-    }
-
-    \item Create, maintain, and use a checklist for saving and sharing changes to the project
-    \only<4>{
-      \begin{itemize}
-        \item writing log messages that clearly explain any changes
-        \item style guidelines for code
-        \item updating to-do list
-        \item bans on committing half-done work or broken code
-      \end{itemize}
-    }
-    \only<5>{
-      This directory may contain two conceptually distinct types of files
-      \begin{itemize}
-        \item files that perform the core analysis
-        \item controller or driver scripts that combine the core analytical
-        functions with particular parameters in order to execute the entire
-        project analysis from start to finish
-      \end{itemize}
-    }
-
-    \item Put files generated during cleanup and analysis in the \texttt{results}
-    directory
-    \only<6>{
-      \begin{itemize}
-        \item cleaned data
-        \item simulated data
-        \item final results (figures and tables)
-        \item files belonging to different papers should be grouped together
-      \end{itemize}
-      This directory can be filled with the content of the \texttt{src} directory
-    }
-
-    \item Put external scripts or compiled programs in the \texttt{bin} directory
-    \only<7>{
-      this directory can be rebuilt from the README and CONTRIBUTING files
-    }
-
-    \item Name all files to reflect their content or function
-    \only<8>{
-    }
-  \end{itemize}
-\end{frame}
-
-\begin{frame}
-  \frametitle{Semantic Versioning 2.0.0 (http://semver.org/)}
-  Given a version number {\bf major.minor.patch}, increment the:
-  \begin{itemize}
-    \item {\bf major} version when you make incompatible API changes,
-    \item {\bf minor} version when you add functionality in a backwards-compatible manner, and
-    \item {\bf patch} version when you make backwards-compatible bug fixes.
-  \end{itemize}
-\end{frame}
-
-\begin{frame}
-  \frametitle{10 Simple Rules for the Open Development of Scientific Software}
-  \only<1>{
-    \begin{block}{Rule 1: Don’t Reinvent the Wheel}
-      Many fundamental scientific algorithms and methods have already been
-      implemented in open-source libraries.\\
-      If it is useful it can benefit everyone, even if it addresses a mundane
-      task.\\
-      When there are no existing implementations for your platform, or they
-      cannot cope with the size, complexity, or other specifics of your data,
-      then new approaches may be required that lead to new science.
-    \end{block}
-  }
-  \only<2>{
-    \begin{block}{Rule 2: Code Well}
-      Learn the basics of software development.\\
-      Study other people’s code and learn by practice.\\
-      Join an existing open-source project.
-    \end{block}
-  }
-  \only<3>{
-    \begin{block}{Rule 3: Be Your Own User}
-      ``eat your own dog food''\\
-      Your software should be useful to other developers, not simply a
-      demonstration of the solution.\\
-      How software is structured or functions in a variety of situations is
-      difficult to detect during peer review.
-    \end{block}
-  }
-  \only<4>{
-    \begin{block}{Rule 4: Be Transparent}
-      People with similar or related research interests who discover the project
-      will find that they have more to gain from collaborating than from
-      competing with the original developers.\\
-      Allows many eyes to evaluate the code and recognize and fix any issues,
-      which reduces the likelihood of serious errors in the final product.
-    \end{block}
-  }
-  \only<5>{
-    \begin{block}{Rule 5: Be Simple}
-      Science is hard enough already.\\
-      Documentation helps a lot, but simplicity is key.\\
-      Employ standard package or software installation models.\\
-      Support standard file formats and don’t come up with new, custom formats.
-    \end{block}
-  }
-  \only<6>{
-    \begin{block}{Rule 6: Don’t Be a Perfectionist}
-      ``Release early, release often''\\
-      ``customers'' will quickly identify problems and new requirements, and you
-      will be able to fix them more quickly if you avoid sitting on and
-      polishing new code for several months.
-    \end{block}
-  }
-  \only<7>{
-    \begin{block}{Rule 7: Nurture and Grow Your Community}
-      Make it easy for others to contribute ideas and act on feedback.\\
-      Avoid changing key aspects of your code that other people’s software or
-      analysis pipelines might depend on, such as file formats, command line
-      arguments, or API.\\
-      http://semver.org
-    \end{block}
-  }
-  \only<8>{
-    \begin{block}{Rule 8: Promote Your Project}
-      A clean, well-organized website.\\
-      Come up with a name and logo.\\
-      Create personae for your project on social networks.\\
-      Go to conferences and give as many presentations as you can.
-    \end{block}
-  }
-  \only<9>{
-    \begin{block}{Rule 9: Find Sponsors}
-      Some level of funding is essential.\\
-      It can be easier if the previous rules are followed.
-    \end{block}
-  }
-  \only<10>{
-    \begin{block}{Rule 10: Science Counts}
-      Maintenance of code that is no longer relevant to your own research is a
-      serious time sink.\\
-      Open-source communities ensure persistence of projects by allowing project
-      leadership to be shared and passed to other members.
-    \end{block}
-  }
-\end{frame}
-
-\begin{frame}
-  \frametitle{10 Simple Rules for Reproducible Computational Research}
-  \framesubtitle{good habits of reproducibility may actually turn out to be
-  a time-saver in the longer run.}
-
-  \only<1>{
-    \begin{block}{Rule 1: For Every Result, Keep Track of How It Was Produced}
-      Record every step necessary to produce the results.\\
-      Name, version and parameters of the programs.
-    \end{block}
-  }
-  \only<2>{
-    \begin{block}{Rule 2: Avoid Manual Data Manipulation Steps}
-      As a minimum note down which data files were modified or moved,
-      and for what purpose.
-    \end{block}
-  }
-  \only<3>{
-    \begin{block}{Rule 3: Archive the Exact Versions of All External Programs
-      Used}
-      The only viable solution may then be to store a full virtual
-      machine image of the OS.\\
-      As a minimum, note the exact names and versions of the programs.
-    \end{block}
-  }
-  \only<4>{
-    \begin{block}{Rule 4: Version Control All Custom Scripts}
-      If computer code is not systematically archived along its evolution,
-      backtracking to a code state that gave a certain result may be a hopeless
-      task.\\
-      Use a version control system.
-    \end{block}
-  }
-  \only<5>{
-    \begin{block}{Rule 5: Record All Intermediate Results, When Possible in
-      Standardized Formats}
-      Reveals discrepancies toward what is assumed that are not apparent in the
-      final results.\\
-      Reveals consequences of alternative programs and parameter choices at
-      individual steps.\\
-      Allows parts of the process to be rerun.\\
-      Experienced inconsistencies can be tracked to the steps where the
-      problems arise.
-    \end{block}
-  }
-  \only<6>{
-    \begin{block}{Rule 6: For Analyses That Include Randomness, Note Underlying
-      Random Seeds}
-      There is a large difference between observing that a result has been reproduced exactly or only approximately.\\
-      This allows results to be reproduced exactly.
-    \end{block}
-  }
-  \only<7>{
-    \begin{block}{Rule 7: Always Store Raw Data behind Plots}
-      Store the code used to make the plot.\\
-      One can simply modify the plotting procedure, instead of having to redo
-      the whole analysis.\\
-      If one really wants to read fine values in a figure, one can consult the
-      raw numbers.
-    \end{block}
-  }
-  \only<8>{
-    \begin{block}{Rule 8: Generate Hierarchical Analysis Output, Allowing Layers
-      of Increasing Detail to Be Inspected}
-      When the storage context allows, it is better to simply incorporate
-      permanent output of all underlying data when a main result is generated,
-      using a systematic naming convention to allow the full data underlying a
-      given summarized value to be easily found.
-    \end{block}
-  }
-  \only<9>{
-    \begin{block}{Rule 9: Connect Textual Statements to Underlying Results}
-      If you want to reevaluate your previous interpretations, or allow peers
-      to make their own assessment of claims you make in a scientific paper, you
-      will have to connect a given textual statement (interpretation, claim,
-      conclusion) to the precise results underlying the statement.\\
-      Such a connection can for instance be a simple file path to detailed
-      results, or the ID of a result in an analysis framework, included within
-      the text itself.
-    \end{block}
-  }
-  \only<10>{
-    \begin{block}{Rule 10: Provide Public Access to Scripts, Runs, and Results}
-      All input data, scripts, versions, parameters, and intermediate results
-      should be made publicly and easily accessible.\\
-      Making reproducibility of your work by peers a realistic possibility sends
-      a strong signal of quality, trustworthiness, and transparency. This could
-      increase the quality and speed of the reviewing process on your work, the
-      chances of your work getting published, and the chances of your work being
-      taken further and cited by other researchers after publication.
-    \end{block}
-  }
-
-\end{frame}
-
-\begin{frame}
-  \frametitle{10 simple rules to enable multi-site collaboration through data
-  sharing}
-  \begin{enumerate}
-    \item Rule 1: Make Software Open-Source
-    collaborators must have access to code in a shared repository.
-    \item Rule 2: Provide Open-Source Data
-    in addition to raw data files, it is also helpful to provide intermediate
-    files at various stages of processing.
-    \item Rule 3: Use Multiple Platforms to Share Research Products
-    this will increase the number of users that will find your data and results
-    interesting.
-    \item Rule 4: Secure Necessary Permissions/Data Use Agreements A Priori
-    before publishing, or providing data in any type of platform it is important
-    to secure all necessary provisions and data use agreements.
-    \item Rule 5: Know the Privacy Rules for Your Data
-    anonymize patient information while allowing patient-level data sharing.
-    \item Rule 6: Facilitate Reproducibility
-    see the 10 simple rules.
-    \item Rule 7: Think Global
-    harmonization of data, accessible documentation.
-    \item Rule 8: Publicize Your Work
-    publish data and analysis / algorithm in different journals.
-    \item Rule 9: Stay Realistic, but Aim High
-    realistic, but do not be afraid to challenge the status quo.
-    \item Rule 10: Be Engaged
-    release early, release often. Communicate often. Care.
-  \end{enumerate}
-\end{frame}
-
-\end{document}