{"slug": "dspinellis--awesome-msr", "title": "Msr", "description": "A curated repository of software engineering repository mining data sets", "github_url": "https://github.com/dspinellis/awesome-msr", "stars": "455", "tag": "Computer Science", "entry_count": 61, "subcategory_count": 1, "subcategories": [{"name": "General", "parent": "", "entries": [{"name": "Repositories", "url": "#repositories", "description": ""}, {"name": "Data Sets", "url": "#data-sets", "description": ""}, {"name": "Tools", "url": "#tools", "description": ""}, {"name": "Research Outlets", "url": "#research-outlets", "description": ""}, {"name": "ESEUR", "url": "https://github.com/Derek-Jones/ESEUR-code-data", "description": "", "stars": "418"}, {"name": "Directory of MSR Datasets", "url": "https://authecesofteng.github.io/directory-msr-datasets/", "description": ""}, {"name": "FLOSSmole", "url": "https://flossmole.org/collection_details", "description": "Collaborative collection and analysis of free/libre/open source project data."}, {"name": "PROMISE", "url": "http://promise.site.uottawa.ca/SERepository/datasets-page.html", "description": "About 20 datasets related to software engineering research."}, {"name": "SIR", "url": "http://sir.unl.edu/portal/index.php", "description": "Software-artifact infrastructure repository; Java, C, C++, and C# software together with test suites and fault data."}, {"name": "Zenodo", "url": "http://zenodo.org/", "description": "Software data collections in CERN's open-access repository."}, {"name": "AndroidTimeMachine", "url": "https://androidtimemachine.github.io", "description": "Graph-based dataset of commit history of 8,431 real-world Android apps."}, {"name": "AndroZoo", "url": "https://androzoo.uni.lu/", "description": "Collection of Android Applications."}, {"name": "Bug Prediction Dataset", "url": "http://bug.inf.usi.ch/index.php", "description": "Collection of models and metrics from Eclipse JDT Core, PDE UI, Equinox Framework, Lucene, Mylyn, and their 
histories."}, {"name": "Code Reviews", "url": "http://kin-y.github.io/miningReviewRepo/", "description": "Code reviews of OpenStack, LibreOffice, AOSP, Qt, Eclipse."}, {"name": "CoREBench", "url": "http://www.comp.nus.edu.sg/%7Erelease/corebench/", "description": "Collection of 70 realistically Complex Regression Errors that were systematically extracted from the repositories and bug reports of four open-source software projects: Make, Grep, Findutils, and Coreutils."}, {"name": "Cryptocurrency GitHub Activity and Market Cap Dataset", "url": "https://rvantonder.github.io/CryptOSS/", "description": "Activity such as commits, stars, prices, and market cap of over 200 cryptocurrency projects on GitHub over time. Raw, historic data is also [available](https://zenodo.org/record/2595588#.XRuzuBNKhSM)."}, {"name": "Defects4J", "url": "https://github.com/rjust/defects4j", "description": "Collection of 395 reproducible bugs collected with the goal of advancing software testing research.", "stars": "889"}, {"name": "Eclipse AERI stacktraces", "url": "http://download.eclipse.org/scava/datasets/aeri_stacktraces/aeri_stacktraces.html", "description": "Collection of stacktraces of Exceptions encountered by users of the Eclipse IDE, as retrieved by the AERI reporting system."}, {"name": "Enron Spreadsheets and Emails", "url": "https://figshare.com/articles/Enron_Spreadsheets_and_Emails/1221767", "description": "All the spreadsheets and emails used in the paper 'Enron's Spreadsheets and Related Emails: A Dataset and Analysis'."}, {"name": "Findbugs-maven", "url": "https://github.com/istlab/maven_bug_catalog", "description": "Set of FindBugs reports for the Java projects of the [Maven repository](https://maven.apache.org).", "stars": "2"}, {"name": "GHTorrent", "url": "http://ghtorrent.org/", "description": "Scalable, queriable, offline mirror of data offered through the GitHub REST API."}, {"name": "GitHub Bug Dataset", "url": 
"http://www.inf.u-szeged.hu/~ferenc/papers/GitHubBugDataSet/", "description": "Bug Dataset of 15 Java open-source projects characterized by static source code metrics."}, {"name": "GitHub on Google BigQuery", "url": "https://cloud.google.com/bigquery/public-data/github", "description": "GitHub data accessible through Google's BigQuery platform."}, {"name": "Grammar Zoo", "url": "http://slebok.github.io/zoo/", "description": "Collection of grammars of DSLs and GPLs, some extracted from metamodels and document schemata."}, {"name": "KaVE", "url": "http://www.kave.cc/datasets", "description": "Developer tool interaction data."}, {"name": "Linux Kernel 4.21 Call Graphs", "url": "https://zenodo.org/record/2652487#.XRnvomUzb0o", "description": "The Linux Kernel 4.21 Call Graphs produced using [CScout (\u2b50213)](https://github.com/dspinellis/cscout/)."}, {"name": "Maven metrics", "url": "https://github.com/bkarak/data_msr2015", "description": "Collection of software complexity & sizing metrics for the [Maven Repository](https://maven.apache.org).", "stars": "0"}, {"name": "Maven Dependency Graph", "url": "https://zenodo.org/record/1489120", "description": "Snapshot of the whole Maven Central taken on September 6, 2018, stored in a graph database."}, {"name": "mzdata", "url": "https://github.com/jxshin/mzdata", "description": "Multi-extract and multi-level dataset of Mozilla issue tracking history.", "stars": "7"}, {"name": "npm-miner", "url": "https://github.com/AuthEceSoftEng/msr-2018-npm-miner", "description": "The dataset contains the analysis results of 5 open source software quality tools eslint, escomplex, nsp, jsinspect and sonarjs for 2000 popular (in terms of stars and downloads) npm packages.", "stars": "1"}, {"name": "OCL Expressions on GitHub", "url": "https://github.com/tue-mdse/ocl-dataset", "description": "Data set of 9188 OCL expressions originating from 504 EMF meta-models in 245 systematically selected GitHub repositories.", "stars": "6"}, {"name": 
"RepoReapers Data Set", "url": "https://reporeapers.github.io", "description": "Data set containing a collection of *engineered software projects* from GHTorrent."}, {"name": "Software Heritage Graph Dataset", "url": "https://doi.org/10.5281/zenodo.2583978", "description": "Graph of the development history and file metadata of >80 million software projects from various forges (GitHub, Gitlab, Debian, PyPI, Google Code, etc) in a deduplicated and unified representation ([paper here](https://dl.acm.org/citation.cfm?id=3341907))."}, {"name": "STAMINA", "url": "http://stamina.chefbe.net/download", "description": "(STAte Machine INference Approaches) data are used to benchmark techniques for learning deterministic finite state machines (FSMs)."}, {"name": "Stack Exchange", "url": "https://archive.org/details/stackexchange", "description": "Anonymized dump of all user-contributed content on the Stack Exchange network."}, {"name": "SWE-bench", "url": "https://www.swebench.com", "description": "SWE-bench is a benchmark designed to evaluate the ability of AI models to solve real-world software engineering problems by generating fixes for issues found in open-source code repositories."}, {"name": "TravisTorrent", "url": "http://travistorrent.testroots.org", "description": "Provides free and easy-to-use Traivs CI build analyses."}, {"name": "Ultimate Debian Database (UDD)", "url": "https://wiki.debian.org/UltimateDebianDatabase", "description": "Data about various aspects of Debian (e.g. 
packages, bugs, maintainers) in the same SQL database."}, {"name": "Unified Bug Dataset", "url": "http://www.inf.u-szeged.hu/~ferenc/papers/UnifiedBugDataSet/", "description": "Static source code based datasets which include the Bugcatchers Bug Dataset, the [Bug Prediction Dataset](http://bug.inf.usi.ch/index.php), the [Eclipse Bug Dataset](https://www.st.cs.uni-saarland.de/softevo/bug-data/eclipse/), the [GitHub Bug Dataset](http://www.inf.u-szeged.hu/~ferenc/papers/GitHubBugDataSet/), and some datasets from the [PROMISE](http://promise.site.uottawa.ca/SERepository/datasets-page.html) repository."}, {"name": "Unix history", "url": "https://github.com/dspinellis/unix-history-repo", "description": "Git repository with 46 years of Unix history evolution.", "stars": "6.9k"}, {"name": "astminer", "url": "https://github.com/JetBrains-Research/astminer", "description": "Library and tool for mining of path-based representations of code and other data derived from ASTs.", "stars": "298"}, {"name": "Boa", "url": "http://boa.cs.iastate.edu/", "description": "Domain-specific language and infrastructure that eases mining software repositories."}, {"name": "buckwheat", "url": "https://github.com/JetBrains-Research/buckwheat", "description": "Multi-language tokenizer for extracting identifiers from source code.", "stars": "24"}, {"name": "ckjm", "url": "http://www.spinellis.gr/sw/ckjm/", "description": "Chidamber and Kemerer Java Metrics."}, {"name": "Coming", "url": "https://github.com/SpoonLabs/coming/", "description": "A Java framework for analyzing code changes and mining instances of change patterns from Git repositories.", "stars": "98"}, {"name": "CryptOSS", "url": "https://github.com/rvantonder/CryptOSS", "description": "Mine GitHub activity and market cap data for cryptocurrency projects.", "stars": "7"}, {"name": "DbDeo", "url": "https://github.com/tushartushar/DbDeo", "description": "Extract embedded SQL statements and detect database schema smells.", "stars": "13"}, 
{"name": "Designite", "url": "http://www.designite-tools.com", "description": "Compute source code metrics and detect a variety of implementation, design, and architecture smells for C#."}, {"name": "DesigniteJava", "url": "https://github.com/tushartushar/DesigniteJava", "description": "Compute source code metrics and detect a variety of implementation and design smells for Java.", "stars": "187"}, {"name": "Diggit", "url": "https://github.com/jrfaller/diggit", "description": "Agile Ruby Tool to analyze Git repositories.", "stars": "20"}, {"name": "GitEvo", "url": "https://github.com/andrehora/gitevo", "description": "Code evolution analysis for Git repositories.", "stars": "12"}, {"name": "GrimoireLab", "url": "http://grimoirelab.github.io/", "description": "Free/Libre/Open Source tools for Software Development Analytics."}, {"name": "MetricMiner", "url": "http://www.github.com/mauricioaniche/metricminer2", "description": "Lean Java DSL to mine and extract data (e.g. commits, developers, modifications, diffs) from Git and SVN repositories."}, {"name": "Maven-miner", "url": "https://github.com/diverse-project/maven-miner", "description": "Java tools and infrastructure to resolve the whole Maven dependency graph, hosted in Maven Central, in the form of a [Neo4j](https://neo4j.com/) Graph.", "stars": "32"}, {"name": "Perceval", "url": "https://github.com/chaoss/grimoirelab-perceval", "description": "Fetch repository data from tens of back-ends.", "stars": "308"}, {"name": "Puppeteer", "url": "https://github.com/tushartushar/Puppeteer", "description": "Detect configuration smells in Puppet code.", "stars": "40"}, {"name": "PyDriller", "url": "https://github.com/ishepard/pydriller", "description": "Python Framework to analyse Git repositories.", "stars": "915"}, {"name": "qmcalc", "url": "https://github.com/dspinellis/cqmetrics", "description": "Calculate quality metrics from C source code.", "stars": "66"}, {"name": "reaper", "url": 
"https://github.com/RepoReapers/reaper", "description": "Python tool to compute a score for a repository from GHTorrent. The score quantifies the extent to which the project contained within the repository is *engineered*.", "stars": "112"}, {"name": "RefactoringMiner", "url": "https://github.com/tsantalis/RefactoringMiner", "description": "Library/API for detection of refactorings in changes of Java code.", "stars": "458"}, {"name": "VulData7", "url": "https://github.com/electricalwind/data7", "description": "Java framework enabling the automated collection of commits fixing vulnerabilities that are reported in NVD (links NVD with Git).", "stars": "43"}]}], "name": ""}