{"slug": "awesome-spark--awesome-spark", "title": "Spark", "description": "A curated list of awesome Apache Spark packages and resources.", "github_url": "https://github.com/awesome-spark/awesome-spark", "stars": "1.7K", "tag": "Big Data", "entry_count": 78, "subcategory_count": 26, "subcategories": [{"name": "Language Bindings", "parent": "Packages", "entries": [{"name": "Kotlin for Apache Spark", "url": "https://github.com/Kotlin/kotlin-spark-api", "description": "", "stars": "459"}, {"name": ".NET for Apache Spark", "url": "https://github.com/dotnet/spark", "description": "", "stars": "2k"}, {"name": "sparklyr", "url": "https://github.com/rstudio/sparklyr", "description": "", "stars": "952"}, {"name": "sparkle", "url": "https://github.com/tweag/sparkle", "description": "", "stars": "447"}, {"name": "spark-connect-rs", "url": "https://github.com/sjrusso8/spark-connect-rs", "description": "", "stars": "85"}, {"name": "spark-connect-go", "url": "https://github.com/apache/spark-connect-go", "description": "", "stars": "155"}, {"name": "spark-connect-csharp", "url": "https://github.com/mdrakiburrahman/spark-connect-csharp", "description": "", "stars": "1"}]}, {"name": "Notebooks and IDEs", "parent": "Packages", "entries": [{"name": "almond", "url": "https://almond.sh/", "description": ""}, {"name": "Apache Zeppelin", "url": "https://zeppelin.incubator.apache.org/", "description": ""}, {"name": "Polynote", "url": "https://polynote.org/", "description": ""}, {"name": "sparkmagic", "url": "https://github.com/jupyter-incubator/sparkmagic", "description": "", "stars": "1.3k"}]}, {"name": "General Purpose Libraries", "parent": "Packages", "entries": [{"name": "itachi", "url": "https://github.com/yaooqinn/itachi", "description": "", "stars": "56"}, {"name": "spark-daria", "url": "https://github.com/mrpowers-io/spark-daria", "description": "", "stars": "751"}, {"name": "quinn", "url": "https://github.com/mrpowers-io/quinn", "description": "", "stars": "632"}, {"name": 
"Apache DataFu", "url": "https://github.com/apache/datafu/tree/master/datafu-spark", "description": "", "stars": "115"}, {"name": "Joblib Apache Spark Backend", "url": "https://github.com/joblib/joblib-spark", "description": "", "stars": "241"}]}, {"name": "SQL Data Sources", "parent": "Packages", "entries": [{"name": "Spark XML", "url": "https://github.com/databricks/spark-xml", "description": "", "stars": "504"}, {"name": "Spark Cassandra Connector", "url": "https://github.com/datastax/spark-cassandra-connector", "description": "", "stars": "1.9k"}, {"name": "Mongo-Spark", "url": "https://github.com/mongodb/mongo-spark", "description": "", "stars": "710"}]}, {"name": "Storage", "parent": "Packages", "entries": [{"name": "Delta Lake", "url": "https://github.com/delta-io/delta", "description": "", "stars": "7.5k"}, {"name": "Apache Hudi", "url": "https://github.com/apache/hudi", "description": "", "stars": "5.4k"}, {"name": "Apache Iceberg", "url": "https://github.com/apache/iceberg", "description": "", "stars": "6.4k"}, {"name": "lakeFS", "url": "https://docs.lakefs.io/integrations/spark.html", "description": ""}]}, {"name": "Bioinformatics", "parent": "Packages", "entries": [{"name": "ADAM", "url": "https://github.com/bigdatagenomics/adam", "description": "", "stars": "1k"}, {"name": "Hail", "url": "https://github.com/hail-is/hail", "description": "", "stars": "976"}]}, {"name": "GIS", "parent": "Packages", "entries": [{"name": "Apache Sedona", "url": "https://github.com/apache/incubator-sedona", "description": "", "stars": "2k"}]}, {"name": "Graph Processing", "parent": "Packages", "entries": [{"name": "GraphFrames", "url": "https://github.com/graphframes/graphframes", "description": "", "stars": "997"}, {"name": "neo4j-spark-connector", "url": "https://github.com/neo4j-contrib/neo4j-spark-connector", "description": "", "stars": "313"}]}, {"name": "Machine Learning Extension", "parent": "Packages", "entries": [{"name": "Apache SystemML", "url": 
"https://systemml.apache.org/", "description": ""}, {"name": "Mahout Spark Bindings", "url": "https://mahout.apache.org/users/sparkbindings/home.html", "description": ""}, {"name": "KeystoneML", "url": "http://keystone-ml.org/", "description": "Type safe machine learning pipelines with RDDs."}, {"name": "JPMML-Spark", "url": "https://github.com/jpmml/jpmml-spark", "description": "", "stars": "94"}, {"name": "ModelDB", "url": "https://mitdbg.github.io/modeldb", "description": ""}, {"name": "Sparkling Water", "url": "https://github.com/h2oai/sparkling-water", "description": "", "stars": "965"}, {"name": "BigDL", "url": "https://github.com/intel-analytics/BigDL", "description": "", "stars": "6.6k"}, {"name": "MLeap", "url": "https://github.com/combust/mleap", "description": "", "stars": "1.5k"}, {"name": "Microsoft ML for Apache Spark", "url": "https://github.com/Azure/mmlspark", "description": "", "stars": "5.1k"}, {"name": "MLflow", "url": "https://mlflow.org/docs/latest/python_api/mlflow.spark.html#module-mlflow.spark", "description": ""}]}, {"name": "Middleware", "parent": "Packages", "entries": [{"name": "Livy", "url": "https://github.com/apache/incubator-livy", "description": "", "stars": "883"}, {"name": "spark-jobserver", "url": "https://github.com/spark-jobserver/spark-jobserver", "description": "", "stars": "2.8k"}, {"name": "Apache Toree", "url": "https://github.com/apache/incubator-toree", "description": "", "stars": "739"}, {"name": "Apache Kyuubi", "url": "https://github.com/apache/kyuubi", "description": "", "stars": "2.1k"}]}, {"name": "Monitoring", "parent": "Packages", "entries": [{"name": "Data Mechanics Delight", "url": "https://github.com/datamechanics/delight", "description": "", "stars": "342"}]}, {"name": "Utilities", "parent": "Packages", "entries": [{"name": "sparkly", "url": "https://github.com/Tubular/sparkly", "description": "", "stars": "60"}, {"name": "Flintrock", "url": "https://github.com/nchammas/flintrock", "description": "", 
"stars": "638"}, {"name": "Optimus", "url": "https://github.com/ironmussa/Optimus/", "description": "", "stars": "1.5k"}]}, {"name": "Natural Language Processing", "parent": "Packages", "entries": [{"name": "spark-nlp", "url": "https://github.com/JohnSnowLabs/spark-nlp", "description": "", "stars": "3.9k"}]}, {"name": "Streaming", "parent": "Packages", "entries": [{"name": "Apache Bahir", "url": "https://bahir.apache.org/", "description": ""}]}, {"name": "Interfaces", "parent": "Packages", "entries": [{"name": "Apache Beam", "url": "https://beam.apache.org/", "description": ""}, {"name": "Koalas", "url": "https://github.com/databricks/koalas", "description": "", "stars": "3.3k"}]}, {"name": "Data quality", "parent": "Packages", "entries": [{"name": "deequ", "url": "https://github.com/awslabs/deequ", "description": "", "stars": "3.3k"}, {"name": "python-deequ", "url": "https://github.com/awslabs/python-deequ", "description": "", "stars": "717"}]}, {"name": "Testing", "parent": "Packages", "entries": [{"name": "spark-testing-base", "url": "https://github.com/holdenk/spark-testing-base", "description": "", "stars": "1.5k"}, {"name": "spark-fast-tests", "url": "https://github.com/mrpowers-io/spark-fast-tests", "description": "", "stars": "432"}, {"name": "chispa", "url": "https://github.com/MrPowers/chispa", "description": "", "stars": "606"}]}, {"name": "Web Archives", "parent": "Packages", "entries": [{"name": "Archives Unleashed Toolkit", "url": "https://github.com/archivesunleashed/aut", "description": "", "stars": "137"}]}, {"name": "Workflow Management", "parent": "Packages", "entries": [{"name": "Cromwell", "url": "https://github.com/broadinstitute/cromwell#spark-backend", "description": "", "stars": "993"}]}, {"name": "Books", "parent": "Resources", "entries": [{"name": "Learning Spark, 2nd Edition", "url": "https://www.oreilly.com/library/view/learning-spark-2nd/9781492050032/", "description": "Introduction to Spark API with Spark 3.0 covered. 
Good source of knowledge about basic concepts."}, {"name": "Advanced Analytics with Spark", "url": "http://shop.oreilly.com/product/0636920035091.do", "description": "Useful collection of Spark processing patterns. Accompanying GitHub repository: [sryza/aas (\u2b501.5k)](https://github.com/sryza/aas)."}, {"name": "Mastering Apache Spark", "url": "https://jaceklaskowski.gitbooks.io/mastering-apache-spark/", "description": "Interesting compilation of notes by [Jacek Laskowski](https://github.com/jaceklaskowski). Focused on different aspects of Spark internals."}, {"name": "Spark in Action", "url": "https://www.manning.com/books/spark-in-action", "description": "New book in Manning's \"in action\" family with 400+ pages. Starts gently, step-by-step, and covers a large number of topics. Free excerpt on how to [set up Eclipse for Spark application development](http://freecontent.manning.com/how-to-start-developing-spark-applications-in-eclipse/) and how to bootstrap a new application using the provided Maven Archetype. 
You can find the accompanying GitHub repo [here (\u2b50273)](https://github.com/spark-in-action/first-edition)."}]}, {"name": "Papers", "parent": "Resources", "entries": [{"name": "Large-Scale Intelligent Microservices", "url": "https://arxiv.org/pdf/2009.08044.pdf", "description": "Microsoft paper that presents an Apache Spark-based micro-service orchestration framework that extends database operations to include web service primitives."}, {"name": "Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing", "url": "https://people.csail.mit.edu/matei/papers/2012/nsdi_spark.pdf", "description": "Paper introducing a core distributed memory abstraction."}, {"name": "Spark SQL: Relational Data Processing in Spark", "url": "https://amplab.cs.berkeley.edu/wp-content/uploads/2015/03/SparkSQLSigmod2015.pdf", "description": "Paper introducing relational underpinnings, code generation and Catalyst optimizer."}, {"name": "Structured Streaming: A Declarative API for Real-Time Applications in Apache Spark", "url": "https://cs.stanford.edu/~matei/papers/2018/sigmod_structured_streaming.pdf", "description": "Structured Streaming is a new high-level streaming API; it is a declarative API based on automatically incrementalizing a static relational query."}]}, {"name": "MOOCS", "parent": "Resources", "entries": [{"name": "Data Science and Engineering with Apache Spark (edX XSeries)", "url": "https://www.edx.org/xseries/data-science-engineering-apache-spark", "description": "Series of five courses ([Introduction to Apache Spark](https://www.edx.org/course/introduction-apache-spark-uc-berkeleyx-cs105x), [Distributed Machine Learning with Apache Spark](https://www.edx.org/course/distributed-machine-learning-apache-uc-berkeleyx-cs120x), [Big Data Analysis with Apache Spark](https://www.edx.org/course/big-data-analysis-apache-spark-uc-berkeleyx-cs110x), [Advanced Apache Spark for Data Science and Data 
Engineering](https://www.edx.org/course/advanced-apache-spark-data-science-data-uc-berkeleyx-cs115x), [Advanced Distributed Machine Learning with Apache Spark](https://www.edx.org/course/advanced-distributed-machine-learning-uc-berkeleyx-cs125x)) covering different aspects of software engineering and data science. Python-oriented."}, {"name": "Big Data Analysis with Scala and Spark (Coursera)", "url": "https://www.coursera.org/learn/big-data-analysys", "description": "Scala-oriented introductory course. Part of [Functional Programming in Scala Specialization](https://www.coursera.org/specializations/scala)."}]}, {"name": "Workshops", "parent": "Resources", "entries": [{"name": "AMP Camp", "url": "http://ampcamp.berkeley.edu", "description": "Periodic training event organized by the [UC Berkeley AMPLab](https://amplab.cs.berkeley.edu/). A source of useful exercises and recorded workshops covering different tools from the [Berkeley Data Analytics Stack](https://amplab.cs.berkeley.edu/software/)."}]}, {"name": "Projects Using Spark", "parent": "Resources", "entries": [{"name": "Oryx 2", "url": "https://github.com/OryxProject/oryx", "description": "[Lambda architecture](http://lambda-architecture.net/) platform built on Apache Spark and [Apache Kafka](http://kafka.apache.org/) with specialization for real-time large-scale machine learning.", "stars": "1.8k"}, {"name": "Photon ML", "url": "https://github.com/linkedin/photon-ml", "description": "A machine learning library supporting classical Generalized Mixed Model and Generalized Additive Mixed Effect Model.", "stars": "793"}, {"name": "PredictionIO", "url": "https://prediction.io/", "description": "Machine Learning server for developers and data scientists to build and deploy predictive applications in a fraction of the time."}, {"name": "Crossdata", "url": "https://github.com/Stratio/Crossdata", "description": "Data integration platform with extended DataSource API and multi-user environment.", "stars": "169"}]}, 
{"name": "Docker Images", "parent": "Resources", "entries": [{"name": "apache/spark", "url": "https://hub.docker.com/r/apache/spark", "description": "Official Apache Spark Docker images."}, {"name": "jupyter/docker-stacks/pyspark-notebook", "url": "https://github.com/jupyter/docker-stacks/tree/master/pyspark-notebook", "description": "PySpark with Jupyter Notebook and Mesos client.", "stars": "8k"}, {"name": "sequenceiq/docker-spark", "url": "https://github.com/sequenceiq/docker-spark", "description": "YARN images from [SequenceIQ](http://www.sequenceiq.com/).", "stars": "765"}, {"name": "datamechanics/spark", "url": "https://hub.docker.com/r/datamechanics/spark", "description": "An easy-to-set-up Docker image for Apache Spark from [Data Mechanics](https://www.datamechanics.co/)."}]}, {"name": "Miscellaneous", "parent": "Resources", "entries": [{"name": "Spark with Scala Gitter channel", "url": "https://gitter.im/spark-scala/Lobby", "description": "\"*A place to discuss and ask questions about using Scala for Spark programming*\" started by [@deanwampler](https://github.com/deanwampler)."}, {"name": "Apache Spark User List", "url": "http://apache-spark-user-list.1001560.n3.nabble.com/", "description": ""}]}], "name": ""}