jaeswift-website/api/data/awesomelist/sector_PRP-020.json

1 line
No EOL
194 KiB
JSON

{"code": "PRP-020", "name": "BIG DATA", "icon": "\ud83d\udcca", "lists": [{"name": "Bigdata", "subcategories": [{"name": "Bigdata", "entries": [{"name": "Awesome Big Data", "url": "#awesome-big-data", "description": ""}, {"name": "Other Awesome Lists", "url": "#other-awesome-lists", "description": ""}, {"name": "MySQL", "url": "https://www.mysql.com/", "description": ""}, {"name": "PostgreSQL", "url": "https://www.postgresql.org/", "description": ""}, {"name": "Oracle Database", "url": "http://www.oracle.com/us/corporate/features/database-12c/index.html", "description": "object-relational database management system."}, {"name": "Teradata", "url": "http://www.teradata.com/products-and-services/teradata-database/", "description": "high-performance MPP data warehouse platform."}, {"name": "Bistro", "url": "https://github.com/facebook/bistro", "description": "general-purpose data processing engine for both batch and stream analytics. It is based on a novel data model, which represents data via *functions* and processes data via *column operations* as opposed to having only set operations in conventional approaches like MapReduce or SQL.", "stars": "1k"}, {"name": "IBM Streams", "url": "https://www.ibm.com/analytics/us/en/technology/stream-computing/", "description": "platform for distributed processing and real-time analytics. Integrates with many of the popular technologies in the Big Data ecosystem (Kafka, HDFS, Spark, etc.)"}, {"name": "Apache Hadoop", "url": "http://hadoop.apache.org/", "description": "framework for distributed processing. Integrates\u00a0MapReduce (parallel processing), YARN (job scheduling) and HDFS (distributed file system)."}, {"name": "Tigon", "url": "https://github.com/caskdata/tigon", "description": "High Throughput Real-time Stream Processing Framework.", "stars": "285"}, {"name": "Pachyderm", "url": "http://pachyderm.io/", "description": "Pachyderm is a data storage platform built on Docker and Kubernetes to provide reproducible data processing and analysis."}, {"name": "Polyaxon", "url": "https://github.com/polyaxon/polyaxon", "description": "A platform for reproducible and scalable machine learning and deep learning.", "stars": "3.7k"}, {"name": "Smooks", "url": "https://github.com/smooks/smooks", "description": "An extensible Java framework for building XML and non-XML (CSV, EDI, Java, etc...) streaming applications.", "stars": "415"}, {"name": "AddThis Hydra", "url": "https://github.com/addthis/hydra", "description": "distributed data processing and storage system originally developed at AddThis.", "stars": "436"}, {"name": "AMPLab SIMR", "url": "http://databricks.github.io/simr/", "description": "run Spark on Hadoop MapReduce v1."}, {"name": "Apache APEX", "url": "https://apex.apache.org/", "description": "a unified, enterprise platform for big data stream and batch processing."}, {"name": "Apache Beam", "url": "https://beam.apache.org/", "description": "an unified model and set of language-specific SDKs for defining and executing data processing workflows."}, {"name": "Apache Crunch", "url": "http://crunch.apache.org/", "description": "a simple Java API for tasks like joining and data aggregation that are tedious to implement on plain MapReduce."}, {"name": "Apache DataFu", "url": "http://incubator.apache.org/projects/datafu.html", "description": "collection of user-defined functions for\u00a0Hadoop and Pig developed by LinkedIn."}, {"name": "Apache Flink", "url": "http://flink.apache.org/", "description": "high-performance runtime, and automatic program optimization."}, {"name": "Apache Gearpump", "url": "http://gearpump.apache.org/", "description": "real-time big data streaming engine based on Akka."}, {"name": "Apache Gora", "url": "http://gora.apache.org/", "description": "framework for in-memory data model and persistence."}, {"name": "Apache Hama", "url": "http://hama.apache.org/", "description": "BSP (Bulk Synchronous Parallel) computing framework."}, {"name": "Apache MapReduce", "url": "https://wiki.apache.org/hadoop/MapReduce/", "description": "programming model for processing large data sets with a parallel, distributed algorithm on a cluster."}, {"name": "Apache Pig", "url": "https://pig.apache.org/", "description": "high level language to express data analysis programs for Hadoop."}, {"name": "Apache REEF", "url": "http://reef.apache.org/", "description": "retainable evaluator execution framework to simplify and unify the lower layers of big data systems."}, {"name": "Apache S4", "url": "http://incubator.apache.org/projects/s4.html", "description": "framework for stream processing, implementation of S4."}, {"name": "Apache Spark", "url": "http://spark.apache.org/", "description": "framework for\u00a0in-memory cluster computing."}, {"name": "Apache Spark Streaming", "url": "https://spark.apache.org/docs/latest/streaming-programming-guide.html", "description": "framework for stream processing, part of Spark."}, {"name": "Apache Storm", "url": "http://storm.apache.org", "description": "framework for stream processing by Twitter also on YARN."}, {"name": "Apache Samza", "url": "http://samza.apache.org/", "description": "stream processing framework, based on Kafka and YARN."}, {"name": "Apache Tez", "url": "http://tez.apache.org/", "description": "application framework\u00a0for executing a complex DAG (directed acyclic graph) of tasks, built on\u00a0YARN."}, {"name": "Apache Twill", "url": "https://incubator.apache.org/projects/twill.html", "description": "abstraction over YARN that reduces the complexity of developing distributed applications."}, {"name": "Baidu Bigflow", "url": "http://bigflow.cloud/en/index.html", "description": "an interface that allows for writing distributed computing programs providing lots of simple, flexible, powerful APIs to easily handle data of any scale."}, {"name": "Cascalog", "url": "http://cascalog.org/", "description": "data processing and querying library."}, {"name": "Cheetah", "url": "http://vldbarc.org/pvldb/vldb2010/pvldb_vol3/I08.pdf", "description": "High Performance, Custom Data Warehouse on Top of MapReduce."}, {"name": "Concurrent Cascading", "url": "http://www.cascading.org/", "description": "framework for data management/analytics on Hadoop."}, {"name": "Damballa Parkour", "url": "https://github.com/damballa/parkour", "description": "MapReduce library for Clojure.", "stars": "255"}, {"name": "Datasalt Pangool", "url": "https://github.com/datasalt/pangool", "description": "alternative MapReduce paradigm.", "stars": "56"}, {"name": "DataTorrent StrAM", "url": "https://www.datatorrent.com/", "description": "real-time engine is designed to enable distributed, asynchronous, real time in-memory big-data computations in as unblocked a way as possible, with minimal overhead and impact on performance."}, {"name": "Facebook Corona", "url": "https://www.facebook.com/notes/facebook-engineering/under-the-hood-scheduling-mapreduce-jobs-more-efficiently-with-corona/10151142560538920", "description": "Hadoop enhancement which removes single point of failure."}, {"name": "Facebook Peregrine", "url": "http://peregrine_mapreduce.bitbucket.org/", "description": "Map Reduce framework."}, {"name": "Facebook Scuba", "url": "https://www.facebook.com/notes/facebook-engineering/under-the-hood-data-diving-with-scuba/10150599692628920", "description": "distributed in-memory datastore."}, {"name": "Google Dataflow", "url": "https://googledevelopers.blogspot.it/2014/06/cloud-platform-at-google-io-new-big.html", "description": "create data pipelines to help them\u00e6ingest, transform and analyze data."}, {"name": "Google MapReduce", "url": "https://research.google.com/archive/mapreduce.html", "description": "map reduce framework."}, {"name": "Google MillWheel", "url": "https://research.google.com/pubs/pub41378.html", "description": "fault tolerant stream processing framework."}, {"name": "IBM Streams", "url": "https://www.ibm.com/analytics/us/en/technology/stream-computing/", "description": "platform for distributed processing and real-time analytics. Provides toolkits for advanced analytics like geospatial, time series, etc. out of the box."}, {"name": "JAQL", "url": "https://code.google.com/p/jaql/", "description": "declarative programming language for working with structured, semi-structured and unstructured data."}, {"name": "Kite", "url": "http://kitesdk.org/docs/current/", "description": "is a set of libraries, tools, examples, and documentation focused on making it easier to build systems on top of the Hadoop ecosystem."}, {"name": "Metamarkets Druid", "url": "http://druid.io/", "description": "framework for real-time analysis of large datasets."}, {"name": "Netflix PigPen", "url": "https://github.com/Netflix/PigPen", "description": "map-reduce for Clojure which compiles to Apache Pig.", "stars": "565"}, {"name": "Nokia Disco", "url": "http://discoproject.org/", "description": "MapReduce framework developed by Nokia."}, {"name": "Onyx", "url": "http://www.onyxplatform.org/", "description": "Distributed computation for the cloud."}, {"name": "Pinterest Pinlater", "url": "https://medium.com/@Pinterest_Engineering/pinlater-an-asynchronous-job-execution-system-b8664cb8aa7d", "description": "asynchronous job execution system."}, {"name": "Pydoop", "url": "http://crs4.github.io/pydoop/", "description": "Python MapReduce and HDFS API for Hadoop."}, {"name": "Ray", "url": "https://github.com/ray-project/ray", "description": "A fast and simple framework for building and running distributed applications.", "stars": "41k"}, {"name": "Rackerlabs Blueflood", "url": "http://blueflood.io/", "description": "multi-tenant distributed metric processing system"}, {"name": "Skale", "url": "https://github.com/skale-me/skale-engine", "description": "High performance distributed data processing in NodeJS.", "stars": "397"}, {"name": "Stratosphere", "url": "http://stratosphere.eu/", "description": "general purpose cluster computing framework."}, {"name": "Streamdrill", "url": "https://streamdrill.com/", "description": "useful for counting activities of event streams over different time windows and finding the most active one."}, {"name": "streamsx.topology", "url": "https://github.com/IBMStreams/streamsx.topology", "description": "Libraries to enable building IBM Streams application in Java, Python or Scala.", "stars": "28"}, {"name": "Tuktu", "url": "https://github.com/UnderstandLingBV/Tuktu", "description": "Easy-to-use platform for batch and streaming computation, built using Scala, Akka and Play!", "stars": "60"}, {"name": "Twitter Heron", "url": "https://github.com/twitter/heron", "description": "Heron is a realtime, distributed, fault-tolerant stream processing engine from Twitter replacing Storm.", "stars": "3.7k"}, {"name": "Twitter Scalding", "url": "https://github.com/twitter/scalding", "description": "Scala library for Map Reduce jobs, built on Cascading.", "stars": "3.5k"}, {"name": "Twitter Summingbird", "url": "https://github.com/twitter/summingbird", "description": "Streaming MapReduce with Scalding and Storm, by Twitter.", "stars": "2.1k"}, {"name": "Twitter TSAR", "url": "https://blog.twitter.com/engineering/en_us/a/2014/tsar-a-timeseries-aggregator.html", "description": "TimeSeries AggregatoR by Twitter."}, {"name": "Wallaroo", "url": "http://www.wallaroolabs.com/community", "description": "The ultrafast and elastic data processing engine. Big or fast data - no fuss, no Java needed."}, {"name": "Ambry", "url": "https://github.com/linkedin/ambry", "description": "a distributed object store that supports storage of trillion of small immutable objects as well as billions of large objects.", "stars": "1.8k"}, {"name": "Apache HDFS", "url": "http://hadoop.apache.org/", "description": "a way to store large files across multiple machines."}, {"name": "Apache Kudu", "url": "http://kudu.apache.org/", "description": "Hadoop's storage layer to enable fast analytics on fast data."}, {"name": "BeeGFS", "url": "https://www.beegfs.io/content/", "description": "formerly FhGFS, parallel distributed file system."}, {"name": "Ceph Filesystem", "url": "http://ceph.com/ceph-storage/file-system/", "description": "software storage platform designed."}, {"name": "Disco DDFS", "url": "http://disco.readthedocs.org/en/latest/howto/ddfs.html", "description": "distributed filesystem."}, {"name": "Facebook Haystack", "url": "https://www.facebook.com/note.php?note_id=76191543919", "description": "object storage system."}, {"name": "Google GFS", "url": "http://static.googleusercontent.com/media/research.google.com/en//archive/gfs-sosp2003.pdf", "description": "distributed filesystem."}, {"name": "Google Megastore", "url": "https://research.google.com/pubs/pub36971.html", "description": "scalable, highly available storage."}, {"name": "GridGain", "url": "https://www.gridgain.com/", "description": "GGFS, Hadoop compliant in-memory file system."}, {"name": "Lustre file system", "url": "http://wiki.lustre.org/", "description": "high-performance distributed filesystem."}, {"name": "Microsoft Azure Data Lake Store", "url": "https://hadoop.apache.org/docs/current/hadoop-azure-datalake/index.html", "description": "HDFS-compatible storage in Azure cloud"}, {"name": "Quantcast File System QFS", "url": "https://www.quantcast.com/about-us/quantcast-file-system/", "description": "open-source distributed file system."}, {"name": "Red Hat GlusterFS", "url": "http://gluster.org/", "description": "scale-out network-attached storage file system."}, {"name": "Seaweed-FS", "url": "https://github.com/chrislusf/seaweedfs", "description": "simple and highly scalable distributed file system.", "stars": "7"}, {"name": "Alluxio", "url": "http://www.alluxio.org/", "description": "reliable file sharing at memory speed across cluster frameworks."}, {"name": "Tahoe-LAFS", "url": "https://www.tahoe-lafs.org/trac/tahoe-lafs", "description": "decentralized cloud storage system."}, {"name": "Baidu File System", "url": "https://github.com/baidu/bfs", "description": "distributed filesystem.", "stars": "2.9k"}, {"name": "Pilosa", "url": "https://github.com/pilosa/pilosa", "description": "", "stars": "2.5k"}, {"name": "Actian Versant", "url": "https://www.actian.com/data-management/ingres-sql-rdbms/", "description": "commercial object-oriented database management systems ."}, {"name": "Crate Data", "url": "https://crate.io/", "description": "is an open source massively scalable data store. It requires zero administration."}, {"name": "Facebook Apollo", "url": "http://www.infoq.com/news/2014/06/facebook-apollo", "description": "Facebook\u2019s Paxos-like NoSQL database."}, {"name": "jumboDB", "url": "http://comsysto.github.io/jumbodb/", "description": "document oriented datastore over Hadoop."}, {"name": "LinkedIn Espresso", "url": "https://engineering.linkedin.com/data", "description": "horizontally scalable document-oriented NoSQL data store."}, {"name": "MarkLogic", "url": "http://www.marklogic.com/", "description": "Schema-agnostic Enterprise NoSQL database technology."}, {"name": "Microsoft Azure DocumentDB", "url": "https://azure.microsoft.com/en-us/services/cosmos-db/", "description": "NoSQL cloud database service with protocol support for MongoDB"}, {"name": "MongoDB", "url": "https://www.mongodb.com/", "description": "Document-oriented database system."}, {"name": "RavenDB", "url": "https://ravendb.net/", "description": "A transactional, open-source Document Database."}, {"name": "RethinkDB", "url": "https://rethinkdb.com/", "description": "document database that supports queries like table joins and group by."}, {"name": "Apache Accumulo", "url": "http://accumulo.apache.org/", "description": "distributed key/value store, built on\u00a0Hadoop."}, {"name": "Apache Cassandra", "url": "http://cassandra.apache.org/", "description": "column-oriented distributed datastore, inspired by\u00a0BigTable."}, {"name": "Apache HBase", "url": "http://hbase.apache.org/", "description": "column-oriented distributed datastore, inspired by BigTable."}, {"name": "Baidu Tera", "url": "https://github.com/baidu/tera", "description": "an Internet-scale database, inspired by BigTable.", "stars": "1.9k"}, {"name": "Facebook HydraBase", "url": "https://code.facebook.com/posts/321111638043166/hydrabase-the-evolution-of-hbase-facebook/", "description": "evolution of HBase made by Facebook."}, {"name": "Google BigTable", "url": "http://static.googleusercontent.com/media/research.google.com/en//archive/bigtable-osdi06.pdf", "description": "column-oriented distributed datastore."}, {"name": "Google Cloud Datastore", "url": "https://cloud.google.com/datastore/docs/concepts/overview", "description": "is a fully managed, schemaless database for storing non-relational data over BigTable."}, {"name": "Hypertable", "url": "http://www.hypertable.org/", "description": "column-oriented distributed datastore, inspired by\u00a0BigTable."}, {"name": "InfiniDB", "url": "https://github.com/infinidb/infinidb/", "description": "is accessed through a MySQL interface and use massive parallel processing to parallelize queries.", "stars": "247"}, {"name": "Tephra", "url": "https://github.com/caskdata/tephra", "description": "Transactions for HBase.", "stars": "158"}, {"name": "Twitter Manhattan", "url": "https://blog.twitter.com/engineering/en_us/a/2014/manhattan-our-real-time-multi-tenant-distributed-database-for-twitter-scale.html", "description": "real-time, multi-tenant distributed database for Twitter scale."}, {"name": "ScyllaDB", "url": "http://www.scylladb.com/", "description": "column-oriented distributed datastore written in C++, totally compatible with Apache Cassandra."}, {"name": "Aerospike", "url": "http://www.aerospike.com/", "description": "NoSQL flash-optimized, in-memory. Open source and \"Server code in 'C' (not Java or Erlang) precisely tuned to avoid context switching and memory copies.\""}, {"name": "Amazon DynamoDB", "url": "https://aws.amazon.com/dynamodb/", "description": "distributed key/value store, implementation of\u00a0Dynamo paper."}, {"name": "Badger", "url": "https://open.dgraph.io/post/badger/", "description": "a fast, simple, efficient, and persistent key-value store written natively in Go."}, {"name": "Bolt", "url": "https://github.com/boltdb/bolt", "description": "an embedded key-value database for Go.", "stars": "15k"}, {"name": "BTDB", "url": "https://github.com/Bobris/BTDB", "description": "Key Value Database in .Net with Object DB Layer, RPC, dynamic IL and much more", "stars": "140"}, {"name": "BuntDB", "url": "https://github.com/tidwall/buntdb", "description": "a fast, embeddable, in-memory key/value database for Go with custom indexing and geospatial support.", "stars": "4.8k"}, {"name": "Edis", "url": "https://github.com/cbd/edis", "description": "is a protocol-compatible Server replacement for Redis.", "stars": "559"}, {"name": "ElephantDB", "url": "https://github.com/nathanmarz/elephantdb", "description": "Distributed database specialized in exporting data from Hadoop.", "stars": "558"}, {"name": "EventStore", "url": "https://geteventstore.com/", "description": "distributed time series database."}, {"name": "GhostDB", "url": "https://github.com/jakekgrog/GhostDB", "description": "a distributed, in-memory, general purpose key-value data store that delivers microsecond performance at any scale.", "stars": "753"}, {"name": "Graviton", "url": "https://github.com/deroproject/graviton", "description": "a simple, fast, versioned, authenticated, embeddable key-value store database in pure Go(lang).", "stars": "424"}, {"name": "GridDB", "url": "https://github.com/griddb/griddb_nosql", "description": "suitable for sensor data stored in a timeseries.", "stars": "2.5k"}, {"name": "HyperDex", "url": "https://github.com/rescrv/HyperDex", "description": "a scalable, next generation key-value and document store with a wide array of features, including consistency, fault tolerance and high performance.", "stars": "1.4k"}, {"name": "Ignite", "url": "https://ignite.apache.org/index.html", "description": "is an in-memory key-value data store providing full SQL-compliant data access that can optionally be backed by disk storage."}, {"name": "LinkedIn Krati", "url": "https://github.com/linkedin-sna/sna-page/tree/master/krati", "description": "is a simple persistent data store with very low latency and high throughput.", "stars": "26"}, {"name": "Linkedin Voldemort", "url": "http://www.project-voldemort.com/voldemort/", "description": "distributed key/value storage system."}, {"name": "Oracle NoSQL Database", "url": "http://www.oracle.com/technetwork/database/database-technologies/nosqldb/overview/index.html", "description": "distributed key-value database by Oracle Corporation."}, {"name": "Redis", "url": "https://redis.io/", "description": "in memory key value datastore."}, {"name": "Riak", "url": "https://github.com/basho/riak", "description": "a decentralized datastore.", "stars": "4k"}, {"name": "Storehaus", "url": "https://github.com/twitter/storehaus", "description": "library to work with asynchronous key value stores, by Twitter.", "stars": "465"}, {"name": "SummitDB", "url": "https://github.com/tidwall/summitdb", "description": "an in-memory, NoSQL key/value database, with disk persistence and using the Raft consensus algorithm.", "stars": "1.4k"}, {"name": "Tarantool", "url": "https://github.com/tarantool/tarantool", "description": "an efficient NoSQL database and a Lua application server.", "stars": "3.6k"}, {"name": "TiKV", "url": "https://github.com/pingcap/tikv", "description": "a distributed key-value database powered by Rust and inspired by Google Spanner and HBase.", "stars": "17k"}, {"name": "Tile38", "url": "https://github.com/tidwall/tile38", "description": "a geolocation data store, spatial index, and realtime geofence, supporting a variety of object types including latitude/longitude points, bounding boxes, XYZ tiles, Geohashes, and GeoJSON", "stars": "9.6k"}, {"name": "TreodeDB", "url": "https://github.com/Treode/store", "description": "key-value store that's replicated and sharded and provides atomic multirow writes.", "stars": "175"}, {"name": "Actionbase", "url": "https://github.com/kakao/actionbase", "description": "a database for user interactions (likes, views, follows) with precomputed reads, supports HBase.", "stars": "190"}, {"name": "AgensGraph", "url": "http://www.agensgraph.com/", "description": "a new generation multi-model graph database for the modern complex data environment."}, {"name": "Apache Giraph", "url": "http://giraph.apache.org/", "description": "implementation of Pregel, based on Hadoop."}, {"name": "Apache Spark Bagel", "url": "http://spark.apache.org/docs/0.7.3/bagel-programming-guide.html", "description": "implementation of Pregel, part of Spark."}, {"name": "ArangoDB", "url": "https://www.arangodb.com/", "description": "multi model distributed database."}, {"name": "DGraph", "url": "https://github.com/dgraph-io/dgraph", "description": "A scalable, distributed, low latency, high throughput graph database aimed at providing Google production level scale and throughput, with low enough latency to be serving real time user queries, over terabytes of structured data.", "stars": "22k"}, {"name": "EliasDB", "url": "https://github.com/krotik/eliasdb", "description": "a lightweight graph based database that does not require any third-party libraries.", "stars": "1k"}, {"name": "Facebook TAO", "url": "https://www.facebook.com/notes/facebook-engineering/tao-the-power-of-the-graph/10151525983993920", "description": "TAO is the distributed data store that is widely used at facebook to store and serve the social graph."}, {"name": "GCHQ Gaffer", "url": "https://github.com/gchq/Gaffer", "description": "Gaffer by GCHQ is a framework that makes it easy to store large-scale graphs in which the nodes and edges have statistics.", "stars": "1.8k"}, {"name": "Google Cayley", "url": "https://github.com/cayleygraph/cayley", "description": "open-source graph database.", "stars": "15k"}, {"name": "Google Pregel", "url": "http://kowshik.github.io/JPregel/pregel_paper.pdf", "description": "graph processing framework."}, {"name": "GraphLab PowerGraph", "url": "https://turi.com/products/create/docs/", "description": "a core C++ GraphLab API and a collection of high-performance machine learning and data mining toolkits built on top of the GraphLab API."}, {"name": "GraphX", "url": "https://amplab.cs.berkeley.edu/publication/graphx-grades/", "description": "resilient Distributed Graph System on Spark."}, {"name": "Gremlin", "url": "https://github.com/tinkerpop/gremlin", "description": "graph traversal Language.", "stars": "2k"}, {"name": "Infovore", "url": "https://github.com/paulhoule/infovore", "description": "RDF-centric Map/Reduce framework.", "stars": "149"}, {"name": "Intel GraphBuilder", "url": "https://01.org/graphbuilder/", "description": "tools to construct large-scale graphs on top of Hadoop."}, {"name": "JanusGraph", "url": "http://janusgraph.org", "description": "open-source, distributed graph database"}, {"name": "MapGraph", "url": "https://www.blazegraph.com/mapgraph-technology/", "description": "Massively Parallel Graph processing on GPUs."}, {"name": "Microsoft Graph Engine", "url": "https://github.com/Microsoft/GraphEngine", "description": "a distributed in-memory data processing engine, underpinned by a strongly-typed in-memory key-value store and a general distributed computation engine.", "stars": "2.2k"}, {"name": "Neo4j", "url": "https://neo4j.com/", "description": "graph database written entirely in Java."}, {"name": "OrientDB", "url": "http://orientdb.com/", "description": "document and graph database."}, {"name": "Phoebus", "url": "https://github.com/xslogic/phoebus", "description": "framework for large scale graph processing.", "stars": "384"}, {"name": "Titan", "url": "http://thinkaurelius.github.io/titan/", "description": "distributed graph database, built over Cassandra."}, {"name": "Twitter FlockDB", "url": "https://github.com/twitter-archive/flockdb", "description": "distributed graph database.", "stars": "3.3k"}, {"name": "NodeXL", "url": "https://nodexl.codeplex.com/", "description": "A free, open-source template for Microsoft\u00ae Excel\u00ae 2007, 2010, 2013 and 2016 that makes it easy to explore network graphs."}, {"name": "Columnar Storage", "url": "http://the-paper-trail.org/blog/columnar-storage/", "description": "an explanation of what columnar storage is and when you might want it."}, {"name": "Actian Vector", "url": "http://www.actian.com/", "description": "column-oriented analytic database."}, {"name": "ClickHouse", "url": "https://clickhouse.yandex/", "description": "an open-source column-oriented database management system that allows generating analytical data reports in real time."}, {"name": "EventQL", "url": "http://eventql.io/", "description": "a distributed, column-oriented database built for large-scale event collection and analytics."}, {"name": "MonetDB", "url": "https://www.monetdb.org/", "description": "column store database."}, {"name": "Parquet", "url": "http://parquet.apache.org/", "description": "columnar storage format for Hadoop."}, {"name": "Pivotal Greenplum", "url": "https://pivotal.io/pivotal-greenplum", "description": "purpose-built, dedicated analytic data warehouse that offers a columnar engine as well as a traditional row-based one."}, {"name": "Vertica", "url": "https://www.vertica.com/", "description": "is designed to manage large, fast-growing volumes of data and provide very fast query performance when used for data warehouses."}, {"name": "SQream DB", "url": "http://sqream.com/", "description": "A GPU powered big data database, designed for analytics and data warehousing, with ANSI-92 compliant SQL, suitable for data sets from 10TB to 1PB."}, {"name": "Google BigQuery", "url": "https://cloud.google.com/bigquery/what-is-bigquery", "description": "Google's cloud offering backed by their pioneering work on Dremel."}, {"name": "Amazon Redshift", "url": "https://aws.amazon.com/redshift/", "description": "Amazon's cloud offering, also based on a columnar datastore backend."}, {"name": "IndexR", "url": "https://github.com/shunfei/indexr", "description": "an open-source columnar storage format for fast & realtime analytic with big data.", "stars": "453"}, {"name": "LocustDB", "url": "https://github.com/cswinter/LocustDB", "description": "an experimental analytics database aiming to set a new standard for query performance on commodity hardware.", "stars": "1.6k"}, {"name": "Actian Ingres", "url": "http://www.actian.com/products/operational-databases/", "description": "commercially supported, open-source SQL relational database management system."}, {"name": "ActorDB", "url": "https://github.com/biokoda/actordb", "description": "a distributed SQL database with the scalability of a KV store, while keeping the query capabilities of a relational database.", "stars": "1.9k"}, {"name": "Amazon RedShift", "url": "http://aws.amazon.com/redshift/", "description": "data warehouse service, based on PostgreSQL."}, {"name": "BayesDB", "url": "https://github.com/probcomp/BayesDB", "description": "statistic oriented SQL database.", "stars": "888"}, {"name": "Bedrock", "url": "http://bedrockdb.com/", "description": "a simple, modular, networked and distributed transaction layer built atop SQLite."}, {"name": "CitusDB", "url": "https://www.citusdata.com/", "description": "scales out PostgreSQL through sharding and replication."}, {"name": "Cockroach", "url": "https://github.com/cockroachdb/cockroach", "description": "Scalable, Geo-Replicated, Transactional Datastore.", "stars": "32k"}, {"name": "Comdb2", "url": "https://github.com/bloomberg/comdb2", "description": "a clustered RDBMS built on optimistic concurrency control techniques.", "stars": "1.5k"}, {"name": "Datomic", "url": "http://www.datomic.com/", "description": "distributed database designed to enable scalable, flexible and intelligent applications."}, {"name": "FoundationDB", "url": "https://foundationdb.com/", "description": "distributed database, inspired by\u00a0F1."}, {"name": "Google F1", "url": "https://research.google.com/pubs/pub41344.html", "description": "distributed SQL database built on Spanner."}, {"name": "Google Spanner", "url": "https://research.google.com/archive/spanner.html", "description": "globally distributed semi-relational database."}, {"name": "H-Store", "url": "http://hstore.cs.brown.edu/", "description": "is an experimental main-memory, parallel database management system that is optimized for on-line transaction processing (OLTP) applications."}, {"name": "Haeinsa", "url": "https://github.com/VCNC/haeinsa", "description": "linearly scalable multi-row, multi-table transaction library for HBase based on Percolator.", "stars": "158"}, {"name": "HandlerSocket", "url": "https://www.percona.com/doc/percona-server/5.5/performance/handlersocket.html", "description": "NoSQL plugin for MySQL/MariaDB."}, {"name": "InfiniSQL", "url": "http://www.infinisql.org/", "description": "infinity scalable RDBMS."}, {"name": "KarelDB", "url": "https://github.com/rayokota/kareldb", "description": "a relational database backed by Apache Kafka.", "stars": "390"}, {"name": "Map-D", "url": "https://www.mapd.com/", "description": "GPU in-memory database, big data analysis and visualization platform."}, {"name": "MemSQL", "url": "http://www.memsql.com/", "description": "in memory SQL database witho optimized columnar storage on flash."}, {"name": "NuoDB", "url": "http://www.nuodb.com/", "description": "SQL/ACID compliant distributed database."}, {"name": "Oracle TimesTen in-Memory Database", "url": "http://www.oracle.com/technetwork/database/database-technologies/timesten/overview/index.html", "description": "in-memory, relational database management system with persistence and recoverability."}, {"name": "Pivotal GemFire XD", "url": "http://gemfirexd.docs.pivotal.io/latest/", "description": "Low-latency, in-memory, distributed SQL data store. Provides SQL interface to in-memory table data, persistable in HDFS."}, {"name": "SAP HANA", "url": "https://hana.sap.com/abouthana.html", "description": "is an in-memory, column-oriented, relational database management system."}, {"name": "SenseiDB", "url": "http://senseidb.github.io/sensei/", "description": "distributed, realtime, semi-structured database."}, {"name": "Sky", "url": "http://skydb.io/", "description": "database used for flexible, high performance analysis of behavioral data."}, {"name": "SymmetricDS", "url": "http://www.symmetricds.org/", "description": "open source software for both file and database synchronization."}, {"name": "TiDB", "url": "https://github.com/pingcap/tidb", "description": "TiDB is a distributed SQL database. Inspired by the design of Google F1.", "stars": "40k"}, {"name": "VoltDB", "url": "https://www.voltdb.com/", "description": "claims to be fastest in-memory database."}, {"name": "yugabyteDB", "url": "https://github.com/YugaByte/yugabyte-db", "description": "open source, high-performance, distributed SQL database compatible with PostgreSQL.", "stars": "10k"}, {"name": "Axibase Time Series Database", "url": "http://axibase.com/products/axibase-time-series-database/", "description": "Integrated time series database on top of HBase with built-in visualization, rule-engine and SQL support."}, {"name": "Chronix", "url": "http://chronix.io/", "description": "a time series storage built to store time series highly compressed and for fast access times."}, {"name": "Cube", "url": "http://square.github.io/cube/", "description": "uses MongoDB to store time series data."}, {"name": "Heroic", "url": "https://spotify.github.io/heroic/#!/index", "description": "is a scalable time series database based on Cassandra and Elasticsearch."}, {"name": "InfluxDB", "url": "https://www.influxdata.com/", "description": "a time series database with optimised IO and queries, supports pgsql and influx wire protocols."}, {"name": "QuestDB", "url": "https://questdb.io/", "description": "high-performance, open-source SQL database for applications in financial services, IoT, machine learning, DevOps and observability."}, {"name": "IronDB", "url": "https://www.circonus.com/irondb/", "description": "scalable, general-purpose time series database."}, {"name": "Kairosdb", "url": "https://github.com/kairosdb/kairosdb", "description": "similar to OpenTSDB but allows for Cassandra.", "stars": "1.8k"}, {"name": "M3DB", "url": "http://m3db.github.io/m3/m3db/", "description": "a distributed time series database that can be used for storing realtime metrics at long retention."}, {"name": "Newts", "url": "https://opennms.github.io/newts/", "description": "a time series database based on Apache Cassandra."}, {"name": "TDengine", "url": "https://github.com/taosdata/TDengine/", "description": "a time series database in C utilizing unique features of IoT to improve read/write throughput and reduce space needed to store data", "stars": "25k"}, {"name": "OpenTSDB", "url": "http://opentsdb.net", "description": "distributed time series database on top of HBase."}, {"name": "Prometheus", "url": "https://prometheus.io/", "description": "a time series database and service monitoring system."}, {"name": "Beringei", "url": "https://github.com/facebookincubator/beringei", "description": "Facebook's in-memory time-series database.", "stars": "3.2k"}, {"name": "TrailDB", "url": "http://traildb.io/", "description": "an efficient tool for storing and querying series of events."}, {"name": "Druid", "url": "https://github.com/druid-io/druid/", "description": "", "stars": "14k"}, {"name": "Riak-TS", "url": "http://basho.com/products/riak-ts/", "description": ""}, {"name": "Akumuli", "url": "https://github.com/akumuli/Akumuli", "description": "", "stars": "840"}, {"name": "Rhombus", "url": "https://github.com/Pardot/Rhombus", "description": ""}, {"name": "Dalmatiner DB", "url": "https://github.com/dalmatinerdb/dalmatinerdb", "description": "", "stars": "690"}, {"name": "Blueflood", "url": "https://github.com/rackerlabs/blueflood", "description": "", "stars": "597"}, {"name": "Timely", "url": "https://github.com/NationalSecurityAgency/timely", "description": "", "stars": "386"}, {"name": "SiriDB", "url": "https://github.com/transceptor-technology/siridb-server", "description": "", "stars": "510"}, {"name": "Thanos", "url": "https://github.com/improbable-eng/thanos", "description": "Thanos is a set of components to create a highly available metric system with unlimited storage capacity using multiple (existing) Prometheus deployments.", "stars": "14k"}, {"name": "VictoriaMetrics", "url": "https://github.com/VictoriaMetrics/VictoriaMetrics", "description": "fast, scalable and resource-effective open-source TSDB compatible with Prometheus. Single-node and cluster versions included", "stars": "16k"}, {"name": "Actian SQL for Hadoop", "url": "http://www.actian.com/analytic-database/vectorh-sql-hadoop", "description": "high performance interactive SQL access to all Hadoop data."}, {"name": "Apache Drill", "url": "http://drill.apache.org/", "description": "framework for interactive analysis, inspired by Dremel."}, {"name": "Apache HCatalog", "url": "https://cwiki.apache.org/confluence/display/Hive/HCatalog", "description": "table and storage management layer for Hadoop."}, {"name": "Apache Hive", "url": "http://hive.apache.org/", "description": "SQL-like data warehouse system for Hadoop."}, {"name": "Apache Calcite", "url": "http://calcite.apache.org/", "description": "framework that allows efficient translation of queries involving heterogeneous and federated data."}, {"name": "Apache Phoenix", "url": "http://phoenix.apache.org/index.html", "description": "SQL skin over HBase."}, {"name": "Aster Database", "url": "http://www.teradata.com/products-and-services/Teradata-Aster/teradata-aster-database", "description": "SQL-like analytic processing for MapReduce."}, {"name": "Cloudera Impala", "url": "https://www.cloudera.com/products/apache-hadoop/impala.html", "description": "framework for interactive analysis, Inspired by Dremel."}, {"name": "Concurrent Lingual", "url": "http://www.cascading.org/projects/lingual/", "description": "SQL-like query language for Cascading."}, {"name": "Datasalt Splout SQL", "url": "http://www.datasalt.com/products/splout-sql/", "description": "full SQL query engine for big datasets."}, {"name": "Dremio", "url": "https://www.dremio.com/", "description": "an open-source, SQL-like Data-as-a-Service Platform based on Apache Arrow."}, {"name": "Facebook PrestoDB", "url": "https://prestodb.io/", "description": "distributed SQL query engine."}, {"name": "Google BigQuery", "url": "https://research.google.com/pubs/pub36632.html", "description": "framework for interactive analysis, implementation of Dremel."}, {"name": "Iceberg", "url": "https://iceberg.apache.org/", "description": "an open table format for huge analytic datasets. Iceberg adds tables to Trino and Spark that use a high-performance format that works just like a SQL table."}, {"name": "Materialize", "url": "https://github.com/materializeinc/materialize", "description": "is a streaming database for real-time applications using SQL for queries and supporting a large fraction of PostgreSQL.", "stars": "6.2k"}, {"name": "Invantive SQL", "url": "https://documentation.invantive.com/2017R2/invantive-sql-grammar/invantive-sql-grammar-17.30.html", "description": "SQL engine for online and on-premise use with integrated local data replication and 70+ connectors."}, {"name": "PipelineDB", "url": "https://www.pipelinedb.com/", "description": "an open-source relational database that runs SQL queries continuously on streams, incrementally storing results in tables."}, {"name": "Pivotal HDB", "url": "https://pivotal.io/pivotal-hdb", "description": "SQL-like data warehouse system for\u00a0Hadoop."}, {"name": "RainstorDB", "url": "http://rainstor.com/products/rainstor-database/", "description": "database for storing petabyte-scale volumes of structured and semi-structured data."}, {"name": "Spark Catalyst", "url": "https://github.com/apache/spark/tree/master/sql", "description": "is a Query Optimization Framework for Spark and Shark.", "stars": "43k"}, {"name": "SparkSQL", "url": "https://databricks.com/blog/2014/03/26/spark-sql-manipulating-structured-data-using-spark-2.html", "description": "Manipulating Structured Data Using Spark."}, {"name": "Splice Machine", "url": "https://www.splicemachine.com/", "description": "a full-featured SQL-on-Hadoop RDBMS with ACID transactions."}, {"name": "Stinger", "url": "https://hortonworks.com/innovation/stinger/", "description": "interactive query for Hive."}, {"name": "Tajo", "url": "http://tajo.apache.org/", "description": "distributed data warehouse system on Hadoop."}, {"name": "Trafodion", "url": "https://wiki.trafodion.org/wiki/index.php/Main_Page", "description": "enterprise-class SQL-on-HBase solution targeting big data transactional or operational workloads."}, {"name": "redpanda", "url": "https://vectorized.io/redpanda", "description": "A Kafka\u00ae replacement for mission critical systems; 10x faster. Written in C++."}, {"name": "Amazon Kinesis", "url": "https://aws.amazon.com/kinesis/", "description": "real-time processing of streaming data at massive scale."}, {"name": "Amazon Web Services Glue", "url": "https://aws.amazon.com/glue/", "description": "serverless fully managed extract, transform, and load (ETL) service"}, {"name": "Census", "url": "https://getcensus.com/", "description": "A reverse ETL product that let you sync data from your data warehouse to SaaS Applications. No engineering favors required\u2014just SQL."}, {"name": "Apache Chukwa", "url": "http://chukwa.apache.org/", "description": "data collection system."}, {"name": "Apache Flume", "url": "http://flume.apache.org/", "description": "service to manage large amount of log data."}, {"name": "Apache Kafka", "url": "http://kafka.apache.org/", "description": "distributed publish-subscribe messaging system."}, {"name": "Apache NiFi", "url": "https://nifi.apache.org/", "description": "Apache NiFi is an integrated data logistics platform for automating the movement of data between disparate systems."}, {"name": "Apache Pulsar", "url": "https://github.com/apache/pulsar", "description": "a distributed pub-sub messaging platform with a very flexible messaging model and an intuitive client API.", "stars": "15k"}, {"name": "Apache Sqoop", "url": "http://sqoop.apache.org/", "description": "tool to transfer data between Hadoop and a structured datastore."}, {"name": "Embulk", "url": "http://www.embulk.org", "description": "open-source bulk data loader that helps data transfer between various databases, storages, file formats, and cloud services."}, {"name": "Estuary", "url": "https://estuary.dev", "description": "SaaS platform based on Gazette with plug-and-play connectors."}, {"name": "Facebook Scribe", "url": "https://github.com/facebookarchive/scribe", "description": "streamed log data aggregator.", "stars": "3.9k"}, {"name": "Fluentd", "url": "http://www.fluentd.org", "description": "tool to collect events and logs."}, {"name": "Gazette", "url": "https://github.com/gazette/core", "description": "Distributed streaming infrastructure built on cloud storage which makes it easy to mix and match batch and streaming paradigms.", "stars": "785"}, {"name": "Google Photon", "url": "https://research.google.com/pubs/pub41318.html", "description": "geographically distributed system for joining multiple continuously flowing streams of data in real-time with high scalability and low latency."}, {"name": "Heka", "url": "https://github.com/mozilla-services/heka", "description": "open source stream processing software system.", "stars": "3.5k"}, {"name": "HIHO", "url": "https://github.com/sonalgoyal/hiho", "description": "framework for connecting disparate data sources with Hadoop.", "stars": "90"}, {"name": "Kestrel", "url": "https://github.com/papertrail/kestrel", "description": "distributed message queue system."}, {"name": "LinkedIn Databus", "url": "https://engineering.linkedin.com/data", "description": "stream of change capture events for a database."}, {"name": "LinkedIn Kamikaze", "url": "https://github.com/linkedin/kamikaze", "description": "utility package for compressing sorted integer arrays.", "stars": "22"}, {"name": "LinkedIn White Elephant", "url": "https://github.com/linkedin/white-elephant", "description": "log aggregator and dashboard.", "stars": "190"}, {"name": "Logstash", "url": "https://www.elastic.co/products/logstash", "description": "a tool for managing events and logs."}, {"name": "Netflix Suro", "url": "https://github.com/Netflix/suro", "description": "log agregattor like Storm and Samza based on Chukwa.", "stars": "796"}, {"name": "Pinterest Secor", "url": "https://github.com/pinterest/secor", "description": "is a service implementing Kafka log persistance.", "stars": "1.9k"}, {"name": "Linkedin Gobblin", "url": "https://github.com/linkedin/gobblin", "description": "linkedin's universal data ingestion framework.", "stars": "2.3k"}, {"name": "Skizze", "url": "https://github.com/skizzehq/skizze", "description": "sketch data store to deal with all problems around counting and sketching using probabilistic data-structures.", "stars": "773"}, {"name": "StreamSets Data Collector", "url": "https://github.com/streamsets/datacollector", "description": "continuous big data ingest infrastructure with a simple to use IDE."}, {"name": "Alooma", "url": "https://www.alooma.com/integrations/mysql", "description": "data pipeline as a service enabling moving data sources such as MySQL into data warehouses."}, {"name": "RudderStack", "url": "https://github.com/rudderlabs/rudder-server", "description": "an open source customer data infrastructure (segment, mParticle alternative) written in go.", "stars": "4.4k"}, {"name": "Zilla", "url": "https://github.com/aklivity/zilla", "description": "An API gateway built for event-driven architectures and streaming that supports standard protocols such as HTTP, SSE, gRPC, MQTT and the native Kafka protocol.", "stars": "676"}, {"name": "Akka Toolkit", "url": "http://akka.io/", "description": "runtime for distributed, and fault tolerant event-driven applications on the JVM."}, {"name": "Apache Avro", "url": "http://avro.apache.org/", "description": "data serialization system."}, {"name": "Apache Curator", "url": "http://curator.apache.org/", "description": "Java libraries for Apache ZooKeeper."}, {"name": "Apache Karaf", "url": "http://karaf.apache.org/", "description": "OSGi runtime that runs on top of any OSGi framework."}, {"name": "Apache Thrift", "url": "http://thrift.apache.org//", "description": "framework to build binary protocols."}, {"name": "Apache Zookeeper", "url": "http://zookeeper.apache.org/", "description": "centralized service for process management."}, {"name": "Google Chubby", "url": "https://research.google.com/archive/chubby.html", "description": "a lock service for loosely-coupled distributed systems."}, {"name": "Hydrosphere Mist", "url": "https://github.com/Hydrospheredata/mist", "description": "a service for exposing Apache Spark analytics jobs and machine learning models as realtime, batch or reactive web services.", "stars": "324"}, {"name": "Linkedin Norbert", "url": "https://engineering.linkedin.com/data", "description": "cluster manager."}, {"name": "Mara", "url": "https://github.com/mara/data-integration", "description": "A lightweight opinionated ETL framework, halfway between plain scripts and Apache Airflow", "stars": "2.1k"}, {"name": "OpenMPI", "url": "https://www.open-mpi.org/", "description": "message passing framework."}, {"name": "Serf", "url": "https://www.serf.io/", "description": "decentralized solution for service discovery and orchestration."}, {"name": "Spotify Luigi", "url": "https://github.com/spotify/luigi", "description": "a Python package for building complex pipelines of batch jobs. It handles dependency resolution, workflow management, visualization, handling failures, command line integration, and much more.", "stars": "19k"}, {"name": "Spring XD", "url": "https://github.com/spring-projects/spring-xd", "description": "distributed and extensible system for data ingestion, real time analytics, batch processing, and data export.", "stars": "477"}, {"name": "Twitter Elephant Bird", "url": "https://github.com/twitter/elephant-bird", "description": "libraries for working with LZOP-compressed data.", "stars": "1.1k"}, {"name": "Twitter Finagle", "url": "https://twitter.github.io/finagle/", "description": "asynchronous network stack for the JVM."}, {"name": "Apache Airflow", "url": "https://github.com/apache/incubator-airflow", "description": "a platform to programmatically author, schedule and monitor workflows.", "stars": "44k"}, {"name": "Apache Aurora", "url": "http://aurora.apache.org/", "description": "is a service scheduler that runs on top of Apache Mesos."}, {"name": "Apache Falcon", "url": "http://falcon.apache.org/", "description": "data management framework."}, {"name": "Apache Oozie", "url": "http://oozie.apache.org/", "description": "workflow job scheduler."}, {"name": "Azure Data Factory", "url": "https://docs.microsoft.com/en-us/azure/data-factory/data-factory-introduction", "description": "cloud-based pipeline orchestration for on-prem, cloud and HDInsight"}, {"name": "Chronos", "url": "http://mesos.github.io/chronos/", "description": "distributed and fault-tolerant scheduler."}, {"name": "Cronicle", "url": "https://github.com/jhuckaby/Cronicle", "description": "Distributed, easy to install, NodeJS based, task scheduler", "stars": "5.4k"}, {"name": "Dagster", "url": "https://github.com/dagster-io/dagster", "description": "a data orchestrator for machine learning, analytics, and ETL.", "stars": "15k"}, {"name": "Linkedin Azkaban", "url": "https://azkaban.github.io/", "description": "batch workflow job scheduler."}, {"name": "Schedoscope", "url": "https://github.com/ottogroup/schedoscope", "description": "Scala DSL for agile scheduling of Hadoop jobs.", "stars": "96"}, {"name": "Sparrow", "url": "https://github.com/radlab/sparrow", "description": "scheduling platform.", "stars": "328"}, {"name": "Azure ML Studio", "url": "https://studio.azureml.net/", "description": "Cloud-based AzureML, R, Python Machine Learning platform"}, {"name": "brain", "url": "https://github.com/harthur/brain", "description": "Neural networks in JavaScript.", "stars": "8k"}, {"name": "Oryx", "url": "https://github.com/OryxProject/oryx", "description": "Lambda architecture on Apache Spark, Apache Kafka for real-time large scale machine learning.", "stars": "1.8k"}, {"name": "Concurrent Pattern", "url": "http://www.cascading.org/projects/pattern/", "description": "machine learning library for Cascading."}, {"name": "convnetjs", "url": "https://github.com/karpathy/convnetjs", "description": "Deep Learning in Javascript. Train Convolutional Neural Networks (or ordinary ones) in your browser.", "stars": "11k"}, {"name": "DataVec", "url": "https://github.com/deeplearning4j/DataVec", "description": "A vectorization and data preprocessing library for deep learning in Java and Scala. Part of the Deeplearning4j ecosystem."}, {"name": "Deeplearning4j", "url": "https://github.com/deeplearning4j", "description": "Fast, open deep learning for the JVM (Java, Scala, Clojure). A neural network configuration layer powered by a C++ library. Uses Spark and Hadoop to train nets on multiple GPUs and CPUs."}, {"name": "Decider", "url": "https://github.com/danielsdeleo/Decider", "description": "Flexible and Extensible Machine Learning in Ruby.", "stars": "383"}, {"name": "ENCOG", "url": "http://www.heatonresearch.com/encog/", "description": "machine learning framework that supports a variety of advanced algorithms, as well as support classes to normalize and process data."}, {"name": "etcML", "url": "http://www.etcml.com/", "description": "text classification with machine learning."}, {"name": "Etsy Conjecture", "url": "https://github.com/etsy/Conjecture", "description": "scalable Machine Learning in Scalding.", "stars": "359"}, {"name": "Feast", "url": "https://github.com/gojek/feast", "description": "A feature store for the management, discovery, and access of machine learning features. Feast provides a consistent view of feature data for both model training and model serving.", "stars": "6.7k"}, {"name": "GraphLab Create", "url": "https://dato.com/products/create/", "description": "A machine learning platform in Python with a broad collection of ML toolkits, data engineering, and deployment tools."}, {"name": "H2O", "url": "https://github.com/h2oai/h2o-3/", "description": "statistical, machine learning and math runtime with Hadoop. R and Python.", "stars": "7.5k"}, {"name": "Karate Club", "url": "https://github.com/benedekrozemberczki/karateclub", "description": "An unsupervised machine learning library for graph structured data. Python", "stars": "2.3k"}, {"name": "Keras", "url": "https://github.com/fchollet/keras", "description": "An intuitive neural net API inspired by Torch that runs atop Theano and Tensorflow.", "stars": "64k"}, {"name": "Lambdo", "url": "https://github.com/johnsonc/lambdo", "description": "Lambdo is a workflow engine which significantly simplifies the analysis process by unifying feature engineering and machine learning operations.", "stars": "1"}, {"name": "Little Ball of Fur", "url": "https://github.com/benedekrozemberczki/littleballoffur", "description": "A subsampling library for graph structured data. Python", "stars": "713"}, {"name": "Mahout", "url": "http://mahout.apache.org/", "description": "An Apache-backed machine learning library for Hadoop."}, {"name": "MLbase", "url": "http://www.mlbase.org/", "description": "distributed machine learning libraries for the BDAS stack."}, {"name": "MLPNeuralNet", "url": "https://github.com/nikolaypavlov/MLPNeuralNet", "description": "Fast multilayer perceptron neural network library for iOS and Mac OS X.", "stars": "902"}, {"name": "ML Workspace", "url": "https://github.com/ml-tooling/ml-workspace", "description": "All-in-one web-based IDE specialized for machine learning and data science.", "stars": "3.5k"}, {"name": "MOA", "url": "http://moa.cms.waikato.ac.nz", "description": "MOA performs big data stream mining in real time, and large scale machine learning."}, {"name": "MonkeyLearn", "url": "https://monkeylearn.com/", "description": "Text mining made easy. Extract and classify data from text."}, {"name": "ND4J", "url": "https://github.com/deeplearning4j/nd4j", "description": "A matrix library for the JVM. Numpy for Java."}, {"name": "nupic", "url": "https://github.com/numenta/nupic", "description": "Numenta Platform for Intelligent Computing: a brain-inspired machine intelligence platform, and biologically accurate neural network based on cortical learning algorithms.", "stars": "6.4k"}, {"name": "PredictionIO", "url": "http://predictionio.incubator.apache.org/index.html", "description": "machine learning server built on Hadoop, Mahout and Cascading."}, {"name": "PyTorch Geometric Temporal", "url": "https://github.com/benedekrozemberczki/pytorch_geometric_temporal", "description": "a temporal extension library for PyTorch Geometric .", "stars": "2.9k"}, {"name": "RL4J", "url": "https://github.com/deeplearning4j/rl4j", "description": "Reinforcement learning for Java and Scala. Includes Deep-Q learning and A3C algorithms, and integrates with Open AI's Gym. Runs in the Deeplearning4j ecosystem."}, {"name": "SAMOA", "url": "http://samoa.incubator.apache.org/", "description": "distributed streaming machine learning framework."}, {"name": "scikit-learn", "url": "https://github.com/scikit-learn/scikit-learn", "description": "scikit-learn: machine learning in Python.", "stars": "65k"}, {"name": "Shapley", "url": "https://github.com/benedekrozemberczki/shapley", "description": "A data-driven framework to quantify the value of classifiers in a machine learning ensemble.", "stars": "223"}, {"name": "Spark MLlib", "url": "http://spark.apache.org/docs/0.9.0/mllib-guide.html", "description": "a Spark implementation of some common machine learning (ML) functionality."}, {"name": "Sibyl", "url": "https://users.soe.ucsc.edu/~niejiazhong/slides/chandra.pdf", "description": "System for Large Scale Machine Learning at Google."}, {"name": "TensorFlow", "url": "https://github.com/tensorflow/tensorflow", "description": "Library from Google for machine learning using data flow graphs.", "stars": "193k"}, {"name": "Theano", "url": "https://github.com/theano", "description": "A Python-focused machine learning library supported by the University of Montreal."}, {"name": "Torch", "url": "https://github.com/torch", "description": "A deep learning library with a Lua API, supported by NYU and Facebook."}, {"name": "Velox", "url": "https://github.com/amplab/velox-modelserver", "description": "System for serving machine learning predictions.", "stars": "110"}, {"name": "Vowpal Wabbit", "url": "https://github.com/JohnLangford/vowpal_wabbit/wiki", "description": "learning system sponsored by Microsoft and Yahoo!.", "stars": "8.6k"}, {"name": "WEKA", "url": "http://www.cs.waikato.ac.nz/ml/weka/", "description": "suite of machine learning software."}, {"name": "BidMach", "url": "https://github.com/BIDData/BIDMach", "description": "CPU and GPU-accelerated Machine Learning Library.", "stars": "920"}, {"name": "Apache Hadoop Benchmarking", "url": "https://issues.apache.org/jira/browse/MAPREDUCE-3561", "description": "micro-benchmarks for testing Hadoop performances."}, {"name": "Berkeley SWIM Benchmark", "url": "https://github.com/SWIMProjectUCB/SWIM/wiki", "description": "real-world big data workload benchmark.", "stars": "129"}, {"name": "Estuary Benchmark Report", "url": "https://github.com/estuary/estuary-warehouse-benchmark", "description": "reproducible, vendor-neutral data warehouse benchmark.", "stars": "2"}, {"name": "Intel HiBench", "url": "https://github.com/intel-hadoop/HiBench", "description": "a Hadoop benchmark suite.", "stars": "1.5k"}, {"name": "PUMA Benchmarking", "url": "https://issues.apache.org/jira/browse/MAPREDUCE-5116", "description": "benchmark suite for MapReduce applications."}, {"name": "Yahoo Gridmix3", "url": "http://yahoohadoop.tumblr.com/post/98294079296/gridmix3-emulating-production-workload-for", "description": "Hadoop cluster benchmarking from Yahoo engineer team."}, {"name": "Deeplearning4j Benchmarks", "url": "https://github.com/deeplearning4j/dl4j-benchmark", "description": ""}, {"name": "UCSB", "url": "https://github.com/unum-cloud/ucsb", "description": "extended Yahoo Cloud Serving Benchmark for NoSQL databases.", "stars": "60"}, {"name": "Apache Ranger", "url": "http://ranger.apache.org/", "description": "Central security admin & fine-grained authorization for Hadoop"}, {"name": "Apache Eagle", "url": "http://eagle.apache.org/", "description": "real time monitoring solution"}, {"name": "Apache Knox Gateway", "url": "http://knox.apache.org/", "description": "single point of secure access for Hadoop clusters."}, {"name": "Apache Sentry", "url": "http://incubator.apache.org/projects/sentry.html", "description": "security module for data stored in Hadoop."}, {"name": "BDA", "url": "https://github.com/kotobukki/BDA/", "description": "The vulnerability detector for Hadoop and Spark", "stars": "104"}, {"name": "Apache Ambari", "url": "http://ambari.apache.org/", "description": "operational framework for Hadoop management."}, {"name": "Apache Bigtop", "url": "http://bigtop.apache.org//", "description": "system deployment framework for the Hadoop ecosystem."}, {"name": "Apache Helix", "url": "http://helix.apache.org/", "description": "cluster management framework."}, {"name": "Apache Mesos", "url": "http://mesos.apache.org/", "description": "cluster manager."}, {"name": "Apache Slider", "url": "https://github.com/apache/incubator-slider", "description": "is a YARN application to deploy existing distributed applications on YARN.", "stars": "77"}, {"name": "Apache Whirr", "url": "http://whirr.apache.org/", "description": "set of libraries for running cloud services."}, {"name": "Apache YARN", "url": "https://hortonworks.com/hadoop/yarn/", "description": "Cluster manager."}, {"name": "Brooklyn", "url": "http://brooklyncentral.github.io/", "description": "library that simplifies application deployment and management."}, {"name": "Buildoop", "url": "http://buildoop.github.io/", "description": "Similar to Apache BigTop based on Groovy language."}, {"name": "Cloudera HUE", "url": "http://gethue.com/", "description": "web application for interacting with Hadoop."}, {"name": "Facebook Prism", "url": "http://www.wired.com/2012/08/facebook-prism/", "description": "multi datacenters replication system."}, {"name": "Google Borg", "url": "https://www.wired.com/2013/03/google-borg-twitter-mesos/all/", "description": "job scheduling and monitoring system."}, {"name": "Google Omega", "url": "https://www.youtube.com/watch?v=0ZFMlO98Jkc", "description": "job scheduling and monitoring system."}, {"name": "Hortonworks HOYA", "url": "https://hortonworks.com/blog/introducing-hoya-hbase-on-yarn/", "description": "application that can deploy HBase cluster on YARN."}, {"name": "Kubernetes", "url": "https://kubernetes.io/", "description": "a system for automating deployment, scaling, and management of containerized applications."}, {"name": "Marathon", "url": "https://github.com/mesosphere/marathon", "description": "Mesos framework for long-running services.", "stars": "4k"}, {"name": "Linkis", "url": "https://github.com/WeBankFinTech/Linkis", "description": "Linkis helps easily connect to various back-end computation/storage engines.", "stars": "3.4k"}, {"name": "411", "url": "https://github.com/etsy/411", "description": "an web application for alert management resulting from scheduled searches into Elasticsearch.", "stars": "972"}, {"name": "Adobe spindle", "url": "https://github.com/adobe-research/spindle", "description": "Next-generation web analytics processing with Scala, Spark, and Parquet.", "stars": "331"}, {"name": "Apache Metron", "url": "http://metron.apache.org/", "description": "a platform that integrates a variety of open source big data technologies in order to offer a centralized tool for security monitoring and analysis."}, {"name": "Apache Nutch", "url": "http://nutch.apache.org/", "description": "open source web crawler."}, {"name": "Apache OODT", "url": "http://oodt.apache.org/", "description": "capturing, processing and sharing of data for NASA's scientific archives."}, {"name": "Apache Tika", "url": "https://tika.apache.org/", "description": "content analysis toolkit."}, {"name": "Argus", "url": "https://github.com/salesforce/Argus", "description": "Time series monitoring and alerting platform."}, {"name": "AthenaX", "url": "https://github.com/uber/AthenaX", "description": "a streaming analytics platform that enables users to run production-quality, large scale streaming analytics using Structured Query Language (SQL).", "stars": "1.2k"}, {"name": "Atlas", "url": "https://github.com/Netflix/atlas", "description": "a backend for managing dimensional time series data.", "stars": "3.5k"}, {"name": "Countly", "url": "https://count.ly/", "description": "open source mobile and web analytics platform, based on Node.js & MongoDB."}, {"name": "Comet", "url": "https://www.comet.com/site/", "description": "Comet provides an end-to-end model evaluation platform for AI developers, with best in class LLM evaluations, experiment tracking, and production monitoring."}, {"name": "Domino", "url": "https://www.dominodatalab.com/", "description": "Run, scale, share, and deploy models \u2014 without any infrastructure."}, {"name": "Eclipse BIRT", "url": "http://www.eclipse.org/birt/", "description": "Eclipse-based reporting system."}, {"name": "ElastAert", "url": "https://github.com/Yelp/elastalert", "description": "ElastAlert is a simple framework for alerting on anomalies, spikes, or other patterns of interest from data in ElasticSearch.", "stars": "8k"}, {"name": "Eventhub", "url": "https://github.com/Codecademy/EventHub", "description": "open source event analytics platform.", "stars": "1.3k"}, {"name": "HASH", "url": "https://hash.ai", "description": "open source simulation and visualization platform."}, {"name": "Hermes", "url": "https://github.com/allegro/hermes", "description": "asynchronous message broker built on top of Kafka.", "stars": "849"}, {"name": "Hunk", "url": "https://www.splunk.com/en_us/download/hunk.html", "description": "Splunk analytics for Hadoop."}, {"name": "Imhotep", "url": "http://opensource.indeedeng.io/imhotep/", "description": "Large scale analytics platform by indeed."}, {"name": "Indicative", "url": "https://www.indicative.com/", "description": "Web & mobile analytics tool, with data warehouse (AWS, BigQuery) integration."}, {"name": "Jupyter", "url": "https://jupyter.org/", "description": "Notebook and project application for interactive data science and scientific computing across all programming languages."}, {"name": "MADlib", "url": "http://madlib.incubator.apache.org/community/", "description": "data-processing library of an RDBMS to analyze data."}, {"name": "Kapacitor", "url": "https://github.com/influxdata/kapacitor", "description": "an open source framework for processing, monitoring, and alerting on time series data.", "stars": "2.4k"}, {"name": "Kylin", "url": "http://kylin.apache.org/", "description": "open source Distributed Analytics Engine from eBay."}, {"name": "PivotalR", "url": "https://github.com/pivotalsoftware/PivotalR", "description": "R on Pivotal HD / HAWQ and PostgreSQL.", "stars": "127"}, {"name": "Opik", "url": "https://www.comet.com/site/products/opik/", "description": "Debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards."}, {"name": "Rakam", "url": "https://github.com/rakam-io/rakam", "description": "open-source real-time custom analytics platform powered by Postgresql, Kinesis and PrestoDB.", "stars": "795"}, {"name": "Qubole", "url": "https://www.qubole.com/", "description": "auto-scaling Hadoop cluster, built-in data connectors."}, {"name": "SnappyData", "url": "https://github.com/SnappyDataInc/snappydata", "description": "a distributed in-memory data store for real-time operational analytics, delivering stream analytics, OLTP (online transaction processing) and OLAP (online analytical processing) built on Spark in a single integrated cluster.", "stars": "1k"}, {"name": "Snowplow", "url": "https://github.com/snowplow/snowplow", "description": "enterprise-strength web and event analytics, powered by Hadoop, Kinesis, Redshift and Postgres.", "stars": "7k"}, {"name": "SparkR", "url": "http://amplab-extras.github.io/SparkR-pkg/", "description": "R frontend for Spark."}, {"name": "Splunk", "url": "https://www.splunk.com/", "description": "analyzer for machine-generated data."}, {"name": "Sumo Logic", "url": "https://www.sumologic.com/", "description": "cloud based analyzer for machine-generated data."}, {"name": "Substation", "url": "https://github.com/brexhq/substation", "description": "Substation is a cloud native data pipeline and transformation toolkit written in Go.", "stars": "390"}, {"name": "Talend", "url": "http://www.talend.com/products/big-data/", "description": "unified open source environment for YARN, Hadoop, HBASE, Hive, HCatalog & Pig."}, {"name": "Apache Lucene", "url": "http://lucene.apache.org/", "description": "Search engine library."}, {"name": "Apache Solr", "url": "http://lucene.apache.org/solr/", "description": "Search platform for Apache Lucene."}, {"name": "Elassandra", "url": "https://github.com/strapdata/elassandra", "description": "is a fork of Elasticsearch modified to run on top of Apache Cassandra in a scalable and resilient peer-to-peer architecture.", "stars": "1.7k"}, {"name": "ElasticSearch", "url": "https://www.elastic.co/", "description": "Search and analytics engine based on Apache\u00a0Lucene."}, {"name": "Enigma.io", "url": "https://www.enigma.com/", "description": "Freemium robust web application for exploring, filtering, analyzing, searching and exporting massive datasets scraped from across the Web."}, {"name": "Google Caffeine", "url": "https://googleblog.blogspot.it/2010/06/our-new-search-index-caffeine.html", "description": "continuous indexing system."}, {"name": "Google Percolator", "url": "https://research.google.com/pubs/pub36726.html", "description": "continuous indexing system."}, {"name": "HBase Coprocessor", "url": "https://blogs.apache.org/hbase/entry/coprocessor_introduction", "description": "implementation of\u00a0Percolator, part of\u00a0HBase."}, {"name": "Lily HBase Indexer", "url": "http://ngdata.github.io/hbase-indexer/", "description": "quickly and easily search for any content stored in HBase."}, {"name": "LinkedIn Bobo", "url": "http://senseidb.github.io/bobo/", "description": "is a Faceted Search implementation written purely in Java, an extension to Apache Lucene."}, {"name": "LinkedIn Cleo", "url": "https://github.com/linkedin/cleo", "description": "is a flexible software library for enabling rapid development of partial, out-of-order and real-time typeahead search.", "stars": "567"}, {"name": "LinkedIn Galene", "url": "https://engineering.linkedin.com/search/did-you-mean-galene", "description": "search architecture at LinkedIn."}, {"name": "LinkedIn Zoie", "url": "https://github.com/senseidb/zoie", "description": "is a realtime search/indexing system written in Java.", "stars": "377"}, {"name": "MG4J", "url": "http://mg4j.di.unimi.it/", "description": "MG4J (Managing Gigabytes for Java) is a full-text search engine for large document collections written in Java. It is highly customisable, high-performance and provides state-of-the-art features and new research algorithms."}, {"name": "Sphinx Search Server", "url": "http://sphinxsearch.com/", "description": "fulltext search engine."}, {"name": "Vespa", "url": "http://vespa.ai/", "description": "is an engine for low-latency computation over large data sets. It stores and indexes your data such that queries, selection and processing over the data can be performed at serving time."}, {"name": "Facebook Faiss", "url": "https://github.com/facebookresearch/faiss", "description": "is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning. Faiss is written in C++ with complete wrappers for Python/numpy.", "stars": "39k"}, {"name": "Annoy", "url": "https://github.com/spotify/annoy", "description": "is a C++ library with Python bindings to search for points in space that are close to a given query point. It also creates large read-only file-based data structures that are mmapped into memory so that many processes may share the same data.", "stars": "14k"}, {"name": "Weaviate", "url": "https://github.com/semi-technologies/weaviate", "description": "Weaviate is a GraphQL-based semantic search engine with build-in (word) embeddings.", "stars": "15k"}, {"name": "Amazon RDS", "url": "https://aws.amazon.com/rds/", "description": "MySQL databases in Amazon's cloud."}, {"name": "Drizzle", "url": "http://www.drizzle.org/", "description": "evolution of MySQL 6.0."}, {"name": "Google Cloud SQL", "url": "https://cloud.google.com/sql/docs/", "description": "MySQL databases in Google's cloud."}, {"name": "MariaDB", "url": "https://mariadb.org/", "description": "enhanced, drop-in replacement for MySQL."}, {"name": "MySQL Cluster", "url": "https://www.mysql.com/products/cluster/", "description": "MySQL implementation using NDB Cluster storage engine."}, {"name": "Percona Server", "url": "https://www.percona.com/software/mysql-database/percona-server", "description": "enhanced, drop-in replacement for MySQL."}, {"name": "ProxySQL", "url": "https://github.com/renecannao/proxysql", "description": "High Performance Proxy for MySQL.", "stars": "25"}, {"name": "TokuDB", "url": "https://www.percona.com/", "description": "TokuDB is a storage engine for MySQL and MariaDB."}, {"name": "WebScaleSQL", "url": "http://webscalesql.org/", "description": "is a collaboration among engineers from several companies that face similar challenges in running MySQL at scale."}, {"name": "HadoopDB", "url": "http://db.cs.yale.edu/hadoopdb/hadoopdb.html", "description": "hybrid of MapReduce and DBMS."}, {"name": "IBM Netezza", "url": "http://www-01.ibm.com/software/data/netezza/", "description": "high-performance data warehouse appliances."}, {"name": "Postgres-XL", "url": "http://www.postgres-xl.org/", "description": "Scalable Open Source PostgreSQL-based Database Cluster."}, {"name": "RecDB", "url": "http://www-users.cs.umn.edu/~sarwat/RecDB/", "description": "Open Source Recommendation Engine Built Entirely Inside PostgreSQL."}, {"name": "Stado", "url": "http://www.stormdb.com/community/stado", "description": "open source MPP database system solely targeted at data warehousing and data mart applications."}, {"name": "Yahoo Everest", "url": "https://www.scribd.com/doc/3159239/70-Everest-PGCon-RT", "description": "multi-peta-byte database / MPP derived by PostgreSQL."}, {"name": "TimescaleDB", "url": "http://www.timescale.com/", "description": "An open-source time-series database optimized for fast ingest and complex queries"}, {"name": "PipelineDB", "url": "https://www.pipelinedb.com/", "description": "The Streaming SQL Database. An open-source relational database that runs SQL queries continuously on streams, incrementally storing results in tables"}, {"name": "Facebook McDipper", "url": "https://www.facebook.com/notes/facebook-engineering/mcdipper-a-key-value-cache-for-flash-storage/10151347090423920", "description": "key/value cache for flash storage."}, {"name": "Facebook Memcached", "url": "https://www.facebook.com/notes/facebook-engineering/scaling-memcache-at-facebook/10151411410803920", "description": "fork of Memcache."}, {"name": "Twemproxy", "url": "https://github.com/twitter/twemproxy", "description": "A fast, light-weight proxy for memcached and redis.", "stars": "12k"}, {"name": "Twitter Fatcache", "url": "https://github.com/twitter/fatcache", "description": "key/value cache for flash storage.", "stars": "1.3k"}, {"name": "Twitter Twemcache", "url": "https://github.com/twitter/twemcache", "description": "fork of Memcache.", "stars": "935"}, {"name": "Actian PSQL", "url": "http://www.actian.com/products/operational-databases/", "description": "ACID-compliant DBMS developed by Pervasive Software, optimized for embedding in applications."}, {"name": "BerkeleyDB", "url": "https://www.oracle.com/database/berkeley-db/index.html", "description": "a software library that provides a high-performance embedded database for key/value data."}, {"name": "HanoiDB", "url": "https://github.com/krestenkrab/hanoidb", "description": "Erlang LSM BTree Storage.", "stars": "310"}, {"name": "LevelDB", "url": "https://github.com/google/leveldb", "description": "a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.", "stars": "39k"}, {"name": "LMDB", "url": "https://symas.com/mdb/", "description": "ultra-fast, ultra-compact key-value embedded data store developed by Symas."}, {"name": "RocksDB", "url": "http://rocksdb.org/", "description": "embeddable persistent key-value store for fast storage based on LevelDB."}, {"name": "BIME Analytics", "url": "https://www.bimeanalytics.com/?lang=en", "description": "business intelligence platform in the cloud."}, {"name": "Blazer", "url": "https://github.com/ankane/blazer", "description": "business intelligence made simple.", "stars": "4.8k"}, {"name": "Chartio", "url": "https://chartio.com", "description": "lean business intelligence platform to visualize and explore your data."}, {"name": "Count", "url": "https://count.co", "description": "notebook-based anlytics and visualisation platform using SQL or drag-and-drop."}, {"name": "datapine", "url": "https://www.datapine.com/", "description": "self-service business intelligence tool in the cloud."}, {"name": "Dekart", "url": "https://dekart.xyz/", "description": "Large scale geospatial analytics for Google BigQuery based on Kepler.gl."}, {"name": "GoodData", "url": "https://www.gooddata.com/", "description": "platform for data products and embedded analytics."}, {"name": "Jaspersoft", "url": "https://www.jaspersoft.com/", "description": "powerful business intelligence suite."}, {"name": "Jedox Palo", "url": "https://www.jedox.com/en/", "description": "customisable Business Intelligence platform."}, {"name": "Jethrodata", "url": "https://jethro.io/", "description": "Interactive Big Data Analytics."}, {"name": "intermix.io", "url": "https://intermix.io/", "description": "Performance Monitoring for Amazon Redshift"}, {"name": "Lightdash", "url": "https://github.com/lightdash/lightdash", "description": "The open source Looker alternative built on dbt", "stars": "5.5k"}, {"name": "Metabase", "url": "https://github.com/metabase/metabase", "description": "The simplest, fastest way to get business intelligence and analytics to everyone in your company.", "stars": "46k"}, {"name": "Microsoft", "url": "http://www.microsoft.com/en-us/server-cloud/solutions/business-intelligence/default.aspx", "description": "business intelligence software and platform."}, {"name": "Microstrategy", "url": "https://www.microstrategy.com/", "description": "software platforms for business intelligence, mobile intelligence, and network applications."}, {"name": "Numeracy", "url": "https://numeracy.co/", "description": "Fast, clean SQL client and business intelligence."}, {"name": "Pentaho", "url": "http://www.pentaho.com/", "description": "business intelligence platform."}, {"name": "Qlik", "url": "http://www.qlik.com/us/", "description": "business intelligence and analytics platform."}, {"name": "Redash", "url": "https://redash.io/", "description": "Open source business intelligence platform, supporting multiple data sources and planned queries."}, {"name": "Saiku Analytics", "url": "https://www.meteorite.bi/", "description": "Open source analytics platform."}, {"name": "Knowage", "url": "https://www.knowage-suite.com/", "description": "open source business intelligence platform. (former [SpagoBi](http://www.spagobi.org/))"}, {"name": "SparklineData SNAP", "url": "http://sparklinedata.com/", "description": "modern B.I platform powered by Apache Spark."}, {"name": "Tableau", "url": "https://www.tableau.com/", "description": "business intelligence platform."}, {"name": "Zoomdata", "url": "https://www.zoomdata.com/", "description": "Big Data Analytics."}, {"name": "Airpal", "url": "https://github.com/airbnb/airpal", "description": "Web UI for PrestoDB.", "stars": "2.8k"}, {"name": "AnyChart", "url": "http://www.anychart.com", "description": "fast, simple and flexible JavaScript (HTML5) charting library featuring pure JS API."}, {"name": "Arbor", "url": "https://github.com/samizdatco/arbor", "description": "graph visualization library using web workers and jQuery.", "stars": "2.7k"}, {"name": "Banana", "url": "https://github.com/LucidWorks/banana", "description": "visualize logs and time-stamped data stored in Solr. Port of Kibana.", "stars": "672"}, {"name": "Bloomery", "url": "https://github.com/ufukomer/bloomery", "description": "Web UI for Impala.", "stars": "18"}, {"name": "Bokeh", "url": "http://bokeh.pydata.org/en/latest/", "description": "A powerful Python interactive visualization library that targets modern web browsers for presentation, with the goal of providing elegant, concise construction of novel graphics in the style of D3.js, but also delivering this capability with high-performance interactivity over very large or streaming datasets."}, {"name": "C3", "url": "http://c3js.org/", "description": "D3-based reusable chart library"}, {"name": "CartoDB", "url": "https://github.com/CartoDB/cartodb", "description": "open-source or freemium hosting for geospatial databases with powerful front-end editing capabilities and a robust API.", "stars": "2.8k"}, {"name": "chartd", "url": "http://chartd.co/", "description": "responsive, retina-compatible charts with just an img tag."}, {"name": "Chart.js", "url": "http://www.chartjs.org/", "description": "open source HTML5 Charts visualizations."}, {"name": "Chartist.js", "url": "https://github.com/gionkunz/chartist-js", "description": "another open source HTML5 Charts visualization.", "stars": "97"}, {"name": "Crossfilter", "url": "http://square.github.io/crossfilter/", "description": "JavaScript library for exploring large multivariate datasets in the browser. Works well with dc.js and d3.js."}, {"name": "Cubism", "url": "https://github.com/square/cubism", "description": "JavaScript library for time series visualization.", "stars": "4.9k"}, {"name": "Cytoscape", "url": "http://cytoscape.github.io/", "description": "JavaScript library for visualizing complex networks."}, {"name": "DC.js", "url": "http://dc-js.github.io/dc.js/", "description": "Dimensional charting built to work natively with crossfilter rendered using d3.js. Excellent for connecting charts/additional metadata to hover events in D3."}, {"name": "D3", "url": "https://d3js.org/", "description": "javaScript library for manipulating documents."}, {"name": "D3.compose", "url": "https://github.com/CSNW/d3.compose", "description": "Compose complex, data-driven visualizations from reusable charts and components.", "stars": "696"}, {"name": "D3Plus", "url": "http://d3plus.org", "description": "A fairly robust set of reusable charts and styles for d3.js."}, {"name": "Dash", "url": "https://github.com/plotly/dash", "description": "Analytical Web Apps for Python, R, Julia, and Jupyter. Built on top of plotly, no JS required", "stars": "24k"}, {"name": "Dekart", "url": "https://dekart.xyz/", "description": "Large scale geospatial analytics for Google BigQuery based on Kepler.gl."}, {"name": "DevExtreme React Chart", "url": "https://devexpress.github.io/devextreme-reactive/react/chart/", "description": "High-performance plugin-based React chart for Bootstrap and Material Design."}, {"name": "Echarts", "url": "https://github.com/ecomfe/echarts", "description": "Baidus enterprise charts.", "stars": "66k"}, {"name": "Envisionjs", "url": "https://github.com/HumbleSoftware/envisionjs", "description": "dynamic HTML5 visualization.", "stars": "1.6k"}, {"name": "FnordMetric", "url": "https://metrictools.org/", "description": "write SQL queries that return SVG charts rather than tables"}, {"name": "Frappe Charts", "url": "https://frappe.io/charts", "description": "GitHub-inspired simple and modern SVG charts for the web with zero dependencies."}, {"name": "Freeboard", "url": "https://github.com/Freeboard/freeboard", "description": "pen source real-time dashboard builder for IOT and other web mashups.", "stars": "6.5k"}, {"name": "Gephi", "url": "https://github.com/gephi/gephi", "description": "An award-winning open-source platform for visualizing and manipulating large graphs and network connections. It's like Photoshop, but for graphs. Available for Windows and Mac OS X.", "stars": "6.4k"}, {"name": "Google Charts", "url": "https://developers.google.com/chart/", "description": "simple charting API."}, {"name": "Grafana", "url": "https://grafana.com/", "description": "graphite dashboard frontend, editor and graph composer."}, {"name": "Graphite", "url": "http://graphiteapp.org/", "description": "scalable Realtime Graphing."}, {"name": "Highcharts", "url": "https://www.highcharts.com/", "description": "simple and flexible charting API."}, {"name": "IPython", "url": "http://ipython.org/", "description": "provides a rich architecture for interactive computing."}, {"name": "Kibana", "url": "https://www.elastic.co/products/kibana", "description": "visualize logs and time-stamped data"}, {"name": "Lumify", "url": "http://lumify.io/", "description": "open source big data analysis and visualization platform"}, {"name": "Matplotlib", "url": "https://github.com/matplotlib/matplotlib", "description": "plotting with Python.", "stars": "22k"}, {"name": "Metricsgraphic.js", "url": "https://metricsgraphicsjs.org/", "description": "a library built on top of D3 that is optimized for time-series data"}, {"name": "NVD3", "url": "http://nvd3.org/", "description": "chart components for d3.js."}, {"name": "Peity", "url": "https://github.com/benpickles/peity", "description": "Progressive SVG bar, line and pie charts.", "stars": "4.2k"}, {"name": "Plot.ly", "url": "https://plot.ly/", "description": "Easy-to-use web service that allows for rapid creation of complex charts, from heatmaps to histograms. Upload data to create and style charts with Plotly's online spreadsheet. Fork others' plots."}, {"name": "Plotly.js", "url": "https://github.com/plotly/plotly.js", "description": "", "stars": "18k"}, {"name": "Recline", "url": "https://github.com/okfn/recline", "description": "simple but powerful library for building data applications in pure Javascript and HTML.", "stars": "2.3k"}, {"name": "Redash", "url": "https://github.com/getredash/redash", "description": "open-source platform to query and visualize data.", "stars": "28k"}, {"name": "ReCharts", "url": "http://recharts.org/", "description": "A composable charting library built on React components"}, {"name": "Shiny", "url": "http://shiny.rstudio.com/", "description": "a web application framework for R."}, {"name": "Sigma.js", "url": "https://github.com/jacomyal/sigma.js", "description": "JavaScript library dedicated to graph drawing.", "stars": "12k"}, {"name": "Superset", "url": "https://github.com/apache/incubator-superset", "description": "a data exploration platform designed to be visual, intuitive and interactive, making it easy to slice, dice and visualize data and perform analytics at the speed of thought.", "stars": "70k"}, {"name": "Vega", "url": "https://github.com/vega/vega", "description": "a visualization grammar.", "stars": "12k"}, {"name": "Zeppelin", "url": "https://github.com/ZEPL/zeppelin", "description": "a notebook-style collaborative data analysis.", "stars": "406"}, {"name": "Zing Charts", "url": "https://www.zingchart.com/", "description": "JavaScript charting library for big data."}, {"name": "DataSphere Studio", "url": "https://github.com/WeBankFinTech/DataSphereStudio", "description": "one-stop data application development management portal.", "stars": "3.3k"}, {"name": "Apache Edgent (Incubating)", "url": "http://edgent.apache.org/", "description": "a programming model and micro-kernel style runtime that can be embedded in gateways and small footprint edge devices enabling local, real-time, analytics on the edge devices."}, {"name": "Azure IoT Hub", "url": "https://azure.microsoft.com/en-us/services/iot-hub/", "description": "Cloud-based bi-directional monitoring and messaging hub"}, {"name": "TempoIQ", "url": "https://www.tempoiq.com/", "description": "Cloud-based sensor analytics."}, {"name": "2lemetry", "url": "http://2lemetry.com/", "description": "Platform for Internet of things."}, {"name": "Pubnub", "url": "https://www.pubnub.com/", "description": "Data stream network"}, {"name": "ThingWorx", "url": "https://www.thingworx.com/", "description": "Rapid development and connection of intelligent systems"}, {"name": "IFTTT", "url": "https://ifttt.com/", "description": "If this then that"}, {"name": "Evrything", "url": "https://evrythng.com/", "description": "Making products smart"}, {"name": "NetLytics", "url": "https://github.com/marty90/netlytics/", "description": "Analytics platform to process network data on Spark.", "stars": "9"}, {"name": "Ably", "url": "https://ably.com/", "description": "Pub/sub messaging platform for IoT"}, {"name": "Big Data Benchmark", "url": "https://amplab.cs.berkeley.edu/benchmark/", "description": "Benchmark of Redshift, Hive, Shark, Impala and Stiger/Tez."}, {"name": "NoSQL Comparison", "url": "https://kkovacs.eu/cassandra-vs-mongodb-vs-couchdb-vs-redis", "description": "Cassandra vs MongoDB vs CouchDB vs Redis vs Riak vs HBase vs Couchbase vs Neo4j vs Hypertable vs ElasticSearch vs Accumulo vs VoltDB vs Scalaris comparison."}, {"name": "Monitoring Kafka performance", "url": "https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics?ref=awesome", "description": "Guide to monitoring Apache Kafka, including native methods for metrics collection."}, {"name": "Monitoring Hadoop performance", "url": "https://www.datadoghq.com/blog/monitor-hadoop-metrics?ref=awesome", "description": "Guide to monitoring Hadoop, with an overview of Hadoop architecture, and native methods for metrics collection."}, {"name": "Monitoring Cassandra performance", "url": "https://www.datadoghq.com/blog/how-to-monitor-cassandra-performance-metrics/?ref=awesome", "description": "Guide to monitoring Cassandra, including native methods for metrics collection."}], "notes": []}, {"name": "Bigdata \u2014 2015 - 2016", "entries": [{"name": "2015", "url": "http://www.vldb.org/pvldb/vol8/p1804-ching.pdf", "description": "**Facebook** - One Trillion Edges: Graph Processing at Facebook-Scale."}], "notes": []}, {"name": "Bigdata \u2014 2013 - 2014", "entries": [{"name": "2014", "url": "http://infolab.stanford.edu/~ullman/mmds/book.pdf", "description": "**Stanford** - Mining of Massive Datasets."}, {"name": "2013", "url": "https://amplab.cs.berkeley.edu/wp-content/uploads/2013/03/eurosys13-paper83.pdf", "description": "**AMPLab** - Presto: Distributed Machine Learning and Graph Processing with Sparse Matrices."}, {"name": "2013", "url": "https://amplab.cs.berkeley.edu/wp-content/uploads/2013/01/dmx1.pdf", "description": "**AMPLab** - MLbase: A Distributed Machine-learning System."}, {"name": "2013", "url": "https://amplab.cs.berkeley.edu/wp-content/uploads/2013/02/shark_sigmod2013.pdf", "description": "**AMPLab** - Shark: SQL and Rich Analytics at Scale."}, {"name": "2013", "url": "https://amplab.cs.berkeley.edu/wp-content/uploads/2013/05/grades-graphx_with_fonts.pdf", "description": "**AMPLab** - GraphX: A Resilient Distributed Graph System on Spark."}, {"name": "2013", "url": "http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf", "description": "**Google** - HyperLogLog in Practice: Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm."}, {"name": "2013", "url": "http://research.microsoft.com/pubs/200169/now-vldb.pdf", "description": "**Microsoft** - Scalable Progressive Analytics on Big Data in the Cloud."}, {"name": "2013", "url": "http://static.druid.io/docs/druid.pdf", "description": "**Metamarkets** - Druid: A Real-time Analytical Data Store."}, {"name": "2013", "url": "http://db.disi.unitn.eu/pages/VLDBProgram/pdf/industry/p764-rae.pdf", "description": "**Google** - Online, Asynchronous Schema Change in F1."}, {"name": "2013", "url": "http://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/41344.pdf", "description": "**Google** - F1: A Distributed SQL Database That Scales."}, {"name": "2013", "url": "http://db.disi.unitn.eu/pages/VLDBProgram/pdf/industry/p734-akidau.pdf", "description": "**Google** - MillWheel: Fault-Tolerant Stream Processing at Internet Scale."}, {"name": "2013", "url": "http://db.disi.unitn.eu/pages/VLDBProgram/pdf/industry/p767-wiener.pdf", "description": "**Facebook** - Scuba: Diving into Data at Facebook."}, {"name": "2013", "url": "http://db.disi.unitn.eu/pages/VLDBProgram/pdf/industry/p871-curtiss.pdf", "description": "**Facebook** - Unicorn: A System for Searching the Social Graph."}, {"name": "2013", "url": "https://www.usenix.org/system/files/conference/nsdi13/nsdi13-final170_update.pdf", "description": "**Facebook** - Scaling Memcache at Facebook."}], "notes": []}, {"name": "Bigdata \u2014 2011 - 2012", "entries": [{"name": "2012", "url": "http://vldb.org/pvldb/vol5/p1771_georgelee_vldb2012.pdf", "description": "**Twitter** - The Unified Logging Infrastructure"}, {"name": "2012", "url": "https://amplab.cs.berkeley.edu/wp-content/uploads/2013/04/blinkdb_vldb12_demo.pdf", "description": "**AMPLab** - Blink and It\u2019s Done: Interactive Queries on Very Large Data."}, {"name": "2012", "url": "https://www.usenix.org/system/files/login/articles/zaharia.pdf", "description": "**AMPLab** - Fast and Interactive Analytics over Hadoop Data with Spark."}, {"name": "2012", "url": "https://amplab.cs.berkeley.edu/wp-content/uploads/2012/03/mod482-xin1.pdf", "description": "**AMPLab** - Shark: Fast Data Analysis Using Coarse-grained Distributed Memory."}, {"name": "2012", "url": "https://www.usenix.org/legacy/event/nsdi11/tech/full_papers/Bolosky.pdf", "description": "**Microsoft** - Paxos Replicated State Machines as the Basis of a High-Performance Data Store."}, {"name": "2012", "url": "http://research.microsoft.com/pubs/178045/ppaoxs-paper29.pdf", "description": "**Microsoft** - Paxos Made Parallel."}, {"name": "2012", "url": "https://arxiv.org/pdf/1203.5485.pdf", "description": "**AMPLab** - BlinkDB: Queries with Bounded Errors and Bounded Response Times on Very Large Data."}, {"name": "2012", "url": "http://vldb.org/pvldb/vol5/p1436_alexanderhall_vldb2012.pdf", "description": "**Google** - Processing a trillion cells per mouse click."}, {"name": "2012", "url": "http://static.googleusercontent.com/media/research.google.com/en//archive/spanner-osdi2012.pdf", "description": "**Google** - Spanner: Google\u2019s Globally-Distributed Database."}, {"name": "2011", "url": "https://amplab.cs.berkeley.edu/wp-content/uploads/2011/06/euro118-ananthanarayanan.pdf", "description": "**AMPLab** - Scarlett: Coping with Skewed Popularity Content in MapReduce Clusters."}, {"name": "2011", "url": "https://amplab.cs.berkeley.edu/wp-content/uploads/2011/06/Mesos-A-Platform-for-Fine-Grained-Resource-Sharing-in-the-Data-Center.pdf", "description": "**AMPLab** - Mesos: A Platform for Fine-Grained Resource Sharing in the Data Center."}, {"name": "2011", "url": "http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36971.pdf", "description": "**Google** - Megastore: Providing Scalable, Highly Available Storage for Interactive Services."}], "notes": []}, {"name": "Bigdata \u2014 2001 - 2010", "entries": [{"name": "2010", "url": "https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Beaver.pdf", "description": "**Facebook** - Finding a needle in Haystack: Facebook\u2019s photo storage."}, {"name": "2010", "url": "https://amplab.cs.berkeley.edu/wp-content/uploads/2011/06/Spark-Cluster-Computing-with-Working-Sets.pdf", "description": "**AMPLab** - Spark: Cluster Computing with Working Sets."}, {"name": "2010", "url": "http://kowshik.github.io/JPregel/pregel_paper.pdf", "description": "**Google** - Pregel: A System for Large-Scale Graph Processing."}, {"name": "2010", "url": "http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36726.pdf", "description": "**Google** - Large-scale Incremental Processing Using Distributed Transactions and notifications base of Percolator and Caffeine."}, {"name": "2010", "url": "http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36632.pdf", "description": "**Google** - Dremel: Interactive Analysis of Web-Scale Datasets."}, {"name": "2010", "url": "http://leoneu.github.io/", "description": "**Yahoo** - S4: Distributed Stream Computing Platform."}, {"name": "2009", "url": "http://www.cs.umd.edu/~abadi/papers/hadoopdb.pdf", "description": "HadoopDB: An Architectural Hybrid of MapReduce and DBMS Technologies for Analytical Workloads."}, {"name": "2008", "url": "https://cwiki.apache.org/confluence/download/attachments/120729877/chukwa_cca08.pdf?version=1\\&modificationDate=1562667399000\\&api=v2", "description": "**AMPLab** - Chukwa: A large-scale monitoring system."}, {"name": "2007", "url": "http://www.read.seas.harvard.edu/~kohler/class/cs239-w08/decandia07dynamo.pdf", "description": "**Amazon** - Dynamo: Amazon\u2019s Highly Available Key-value Store."}, {"name": "2006", "url": "http://static.googleusercontent.com/media/research.google.com/en//archive/chubby-osdi06.pdf", "description": "**Google** - The Chubby lock service for loosely-coupled distributed systems."}, {"name": "2006", "url": "http://static.googleusercontent.com/external_content/untrusted_dlcp/research.google.com/en//archive/bigtable-osdi06.pdf", "description": "**Google** - Bigtable: A Distributed Storage System for Structured Data."}, {"name": "2004", "url": "http://static.googleusercontent.com/media/research.google.com/en//archive/mapreduce-osdi04.pdf", "description": "**Google** - MapReduce: Simplied Data Processing on Large Clusters."}, {"name": "2003", "url": "http://static.googleusercontent.com/media/research.google.com/en//archive/gfs-sosp2003.pdf", "description": "**Google** - The Google File System."}, {"name": "Spark in Motion", "url": "https://www.manning.com/livevideo/spark-in-motion", "description": "Spark in Motion teaches you how to use Spark for batch and streaming data analytics."}, {"name": "Machine Learning, Data Science and Deep Learning with Python", "url": "https://www.manning.com/livevideo/machine-learning-data-science-and-deep-learning-with-python", "description": "LiveVideo tutorial that covers machine learning, Tensorflow, artificial intelligence, and neural networks."}, {"name": "Data warehouse schema design - dimensional modeling and star schema", "url": "https://snir.dev/talks/data-warehouse-schema-design", "description": "Introduction to schema design for data warehouse using the star schema method."}, {"name": "Elasticsearch 7 and Elastic Stack", "url": "https://www.manning.com/livevideo/elasticsearch-7-and-elastic-stack", "description": "LiveVideo tutorial that covers searching, analyzing, and visualizing big data on a cluster with Elasticsearch, Logstash, Beats, Kibana, and more."}, {"name": "Data Science at Scale with Python and Dask", "url": "https://www.manning.com/books/data-science-at-scale-with-python-and-dask", "description": "Data Science at Scale with Python and Dask teaches you how to build distributed data projects that can handle huge amounts of data."}, {"name": "Streaming Data", "url": "https://www.manning.com/books/streaming-data", "description": "Streaming Data introduces the concepts and requirements of streaming and real-time data systems."}, {"name": "Storm Applied", "url": "https://www.manning.com/books/storm-applied", "description": "Storm Applied is a practical guide to using Apache Storm for the real-world tasks associated with processing and analyzing real-time data streams."}, {"name": "Fundamentals of Stream Processing: Application Design, Systems, and Analytics", "url": "http://www.cambridge.org/us/academic/subjects/engineering/communications-and-signal-processing/fundamentals-stream-processing-application-design-systems-and-analytics", "description": "This comprehensive, hands-on guide combining the fundamental building blocks and emerging research in stream processing is ideal for application designers, system builders, analytic developers, as well as students and researchers in the field."}, {"name": "Stream Data Processing: A Quality of Service Perspective", "url": "http://www.springer.com/us/book/9780387710020", "description": "Presents a new paradigm suitable for stream and complex event processing."}, {"name": "Unified Log Processing", "url": "https://www.manning.com/books/event-streams-in-action", "description": "Unified Log Processing is a practical guide to implementing a unified log of event streams (Kafka or Kinesis) in your business"}, {"name": "Kafka Streams in Action", "url": "https://www.manning.com/books/kafka-streams-in-action", "description": "Kafka Streams in Action teaches you everything you need to know to implement stream processing on data flowing into your Kafka platform, allowing you to focus on getting more from your data without sacrificing time or effort."}, {"name": "Big Data", "url": "https://www.manning.com/books/big-data", "description": "Big Data teaches you to build big data systems using an architecture that takes advantage of clustered hardware along with new tools designed specifically to capture and analyze web-scale data."}, {"name": "Spark in Action", "url": "https://www.manning.com/books/spark-in-action", "description": ""}, {"name": "Kafka in Action", "url": "https://www.manning.com/books/kafka-in-action", "description": "Kafka in Action is a fast-paced introduction to every aspect of working with Kafka you need to really reap its benefits."}, {"name": "Fusion in Action", "url": "https://www.manning.com/books/fusion-in-action", "description": "Fusion in Action teaches you to build a full-featured data analytics pipeline, including document and data search and distributed data clustering."}, {"name": "Reactive Data Handling", "url": "https://www.manning.com/books/reactive-data-handling", "description": "Reactive Data Handling is a collection of five hand-picked chapters, selected by Manuel Bernhardt, that introduce you to building reactive applications capable of handling real-time processing with large data loads--free eBook!"}, {"name": "Azure Data Engineering", "url": "https://www.manning.com/books/azure-data-engineering", "description": "A book about data engineering in general and the Azure platform specifically"}, {"name": "Grokking Streaming Systems", "url": "https://www.manning.com/books/grokking-streaming-systems", "description": "Grokking Streaming Systems helps you unravel what streaming systems are, how they work, and whether they\u2019re right for your business. Written to be tool-agnostic, you\u2019ll be able to apply what you learn no matter which framework you choose."}, {"name": "Distributed Systems for fun and profit", "url": "http://book.mixu.net/distsys/", "description": "Theory of distributed systems. Include parts about time and ordering, replication and impossibility results."}, {"name": "Graph-Powered Machine Learning", "url": "https://www.manning.com/books/graph-powered-machine-learning", "description": "Alessandro Negro. Combine graph theory and models to improve machine learning projects"}], "notes": []}, {"name": "Bigdata \u2014 Data Visualization", "entries": [{"name": "The beauty of data visualization", "url": "https://www.youtube.com/watch?v=5Zg-C8AAIGg", "description": ""}, {"name": "Designing Data Visualizations with Noah Iliinsky", "url": "https://www.youtube.com/watch?v=R-oiKt7bUU8", "description": ""}, {"name": "Hans Rosling's 200 Countries, 200 Years, 4 Minutes", "url": "https://www.youtube.com/watch?v=jbkSRLYSojo", "description": ""}, {"name": "Ice Bucket Challenge Data Visualization", "url": "https://www.youtube.com/watch?v=qTEchen97rQ", "description": ""}, {"name": "Google Bigtable", "url": "https://github.com/zrosenbauer/awesome-bigtable", "description": "", "stars": "54"}], "notes": []}], "total_entries": 615}, {"name": "Data Engineering", "subcategories": [{"name": "Data Engineering", "entries": [{"name": "Databases", "url": "#databases", "description": ""}, {"name": "Data Comparison", "url": "#data-comparison", "description": ""}, {"name": "Data Ingestion", "url": "#data-ingestion", "description": ""}, {"name": "File System", "url": "#file-system", "description": ""}, {"name": "Serialization format", "url": "#serialization-format", "description": ""}, {"name": "Stream Processing", "url": "#stream-processing", "description": ""}, {"name": "Batch Processing", "url": "#batch-processing", "description": ""}, {"name": "Charts and Dashboards", "url": "#charts-and-dashboards", "description": ""}, {"name": "Workflow", "url": "#workflow", "description": ""}, {"name": "Data Lake Management", "url": "#data-lake-management", "description": ""}, {"name": "ELK Elastic Logstash Kibana", "url": "#elk-elastic-logstash-kibana", "description": ""}, {"name": "Docker", "url": "#docker", "description": ""}, {"name": "Datasets", "url": "#datasets", "description": ""}, {"name": "Monitoring", "url": "#monitoring", "description": ""}, {"name": "Profiling", "url": "#profiling", "description": ""}, {"name": "Testing", "url": "#testing", "description": ""}, {"name": "Community", "url": "#community", "description": ""}, {"name": "datacompy", "url": "https://github.com/capitalone/datacompy", "description": "A Python library that facilitates the comparison of two DataFrames in Pandas, Polars, Spark and more. The library goes beyond basic equality checks by providing detailed insights into discrepancies at both row and column levels.", "stars": "632"}, {"name": "dvt", "url": "https://github.com/GoogleCloudPlatform/professional-services-data-validator", "description": "Data Validation Tool compares data from source and target tables to ensure that they match. It provides column validation, row validation, schema validation, custom query validation, and ad hoc SQL exploration.", "stars": "493"}, {"name": "koala-diff", "url": "https://github.com/godalida/koala-diff", "description": "A high-performance Python library for comparing large datasets (CSV, Parquet) locally using Rust and Polars. It features zero-copy streaming to prevent OOM errors and generates interactive HTML data quality reports.", "stars": "4"}, {"name": "everyrow", "url": "https://github.com/futuresearch/everyrow-sdk", "description": "AI-powered data operations SDK for Python. Semantic deduplication, fuzzy table merging, and intelligent row ranking using LLM agents.", "stars": "16"}, {"name": "ingestr", "url": "https://github.com/bruin-data/ingestr", "description": "CLI tool to copy data between databases with a single command. Supports 50+ sources including PostgreSQL, MySQL, MongoDB, Salesforce, Shopify to any data warehouse.", "stars": "3.4k"}, {"name": "Kafka", "url": "https://kafka.apache.org/", "description": "Publish-subscribe messaging rethought as a distributed commit log."}, {"name": "AWS Kinesis", "url": "https://aws.amazon.com/kinesis/", "description": "A fully managed, cloud-based service for real-time data processing over large, distributed data streams."}, {"name": "RabbitMQ", "url": "https://www.rabbitmq.com/", "description": "Robust messaging for applications."}, {"name": "dlt", "url": "https://www.dlthub.com", "description": "A fast\\&simple pipeline building library for Python data devs, runs in notebooks, cloud functions, airflow, etc."}, {"name": "FluentD", "url": "https://www.fluentd.org", "description": "An open source data collector for unified logging layer."}, {"name": "Embulk", "url": "https://www.embulk.org", "description": "An open source bulk data loader that helps data transfer between various databases, storages, file formats, and cloud services."}, {"name": "Apache Sqoop", "url": "https://sqoop.apache.org", "description": "A tool designed for efficiently transferring bulk data between Apache Hadoop and structured datastores such as relational databases."}, {"name": "Heka", "url": "https://github.com/mozilla-services/heka", "description": "Data Acquisition and Processing Made Easy. Deprecated.", "stars": "3.5k"}, {"name": "Gobblin", "url": "https://github.com/apache/incubator-gobblin", "description": "Universal data ingestion framework for Hadoop from LinkedIn.", "stars": "2.3k"}, {"name": "Nakadi", "url": "https://nakadi.io", "description": "An open source event messaging platform that provides a REST API on top of Kafka-like queues."}, {"name": "Pravega", "url": "https://www.pravega.io", "description": "Provides a new storage abstraction - a stream - for continuous and unbounded data."}, {"name": "Apache Pulsar", "url": "https://pulsar.apache.org/", "description": "An open-source distributed pub-sub messaging system."}, {"name": "AWS Data Wrangler", "url": "https://github.com/awslabs/aws-data-wrangler", "description": "Utility belt to handle data on AWS.", "stars": "4.1k"}, {"name": "Airbyte", "url": "https://airbyte.io/", "description": "Open-source data integration for modern data teams."}, {"name": "Artie", "url": "https://www.artie.com/", "description": "Real-time data ingestion tool leveraging change data capture."}, {"name": "Sling", "url": "https://slingdata.io/", "description": "CLI data integration tool specialized in moving data between databases, as well as storage systems."}, {"name": "Meltano", "url": "https://meltano.com/", "description": "CLI & code-first ELT."}, {"name": "Google Sheets ETL", "url": "https://github.com/fulldecent/google-sheets-etl", "description": "Live import all your Google Sheets to your data warehouse.", "stars": "22"}, {"name": "CsvPath Framework", "url": "https://www.csvpath.org/", "description": "A delimited data preboarding framework that fills the gap between MFT and the data lake."}, {"name": "Estuary Flow", "url": "https://estuary.dev", "description": "No/low-code data pipeline platform that handles both batch and real-time data ingestion."}, {"name": "db2lake", "url": "https://github.com/bahador-r/db2lake", "description": "Lightweight Node.js ETL framework for databases \u2192 data lakes/warehouses.", "stars": "2"}, {"name": "Kreuzberg", "url": "https://github.com/kreuzberg-dev/kreuzberg", "description": "Polyglot document intelligence library with a Rust core and bindings for Python, TypeScript, Go, and more. Extracts text, tables, and metadata from 62+ document formats for data pipeline ingestion.", "stars": "5.8k"}, {"name": "HDFS", "url": "https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html", "description": "A distributed file system designed to run on commodity hardware."}, {"name": "AWS S3", "url": "https://aws.amazon.com/s3/", "description": "Object storage built to retrieve any amount of data from anywhere."}, {"name": "Alluxio", "url": "https://www.alluxio.org/", "description": "A memory-centric distributed storage system enabling reliable data sharing at memory-speed across cluster frameworks, such as Spark and MapReduce."}, {"name": "CEPH", "url": "https://ceph.com/", "description": "A unified, distributed storage system designed for excellent performance, reliability, and scalability."}, {"name": "JuiceFS", "url": "https://github.com/juicedata/juicefs", "description": "A high-performance Cloud-Native file system driven by object storage for large-scale data storage.", "stars": "13k"}, {"name": "OrangeFS", "url": "https://www.orangefs.org/", "description": "Orange File System is a branch of the Parallel Virtual File System."}, {"name": "SnackFS", "url": "https://github.com/tuplejump/snackfs-release", "description": "A bite-sized, lightweight HDFS compatible file system built over Cassandra.", "stars": "14"}, {"name": "GlusterFS", "url": "https://www.gluster.org/", "description": "Gluster Filesystem."}, {"name": "XtreemFS", "url": "https://www.xtreemfs.org/", "description": "Fault-tolerant distributed file system for all storage needs."}, {"name": "SeaweedFS", "url": "https://github.com/chrislusf/seaweedfs", "description": "Seaweed-FS is a simple and highly scalable distributed file system. There are two objectives: to store billions of files! to serve the files fast! Instead of supporting full POSIX file system semantics, Seaweed-FS choose to implement only a key\\~file mapping. Similar to the word \"NoSQL\", you can call it as \"NoFS\".", "stars": "16"}, {"name": "S3QL", "url": "https://github.com/s3ql/s3ql/", "description": "A file system that stores all its data online using storage services like Google Storage, Amazon S3, or OpenStack.", "stars": "1.2k"}, {"name": "LizardFS", "url": "https://lizardfs.com/", "description": "Software Defined Storage is a distributed, parallel, scalable, fault-tolerant, Geo-Redundant and highly available file system."}, {"name": "Apache Avro", "url": "https://avro.apache.org", "description": "Apache Avro\u2122 is a data serialization system."}, {"name": "Apache Parquet", "url": "https://parquet.apache.org", "description": "A columnar storage format available to any project in the Hadoop ecosystem, regardless of the choice of data processing framework, data model or programming language."}, {"name": "Apache ORC", "url": "https://orc.apache.org/", "description": "The smallest, fastest columnar storage for Hadoop workloads."}, {"name": "Apache Thrift", "url": "https://thrift.apache.org", "description": "The Apache Thrift software framework, for scalable cross-language services development."}, {"name": "ProtoBuf", "url": "https://github.com/protocolbuffers/protobuf", "description": "Protocol Buffers - Google's data interchange format.", "stars": "71k"}, {"name": "SequenceFile", "url": "https://wiki.apache.org/hadoop/SequenceFile", "description": "A flat file consisting of binary key/value pairs. It is extensively used in MapReduce as input/output formats."}, {"name": "Kryo", "url": "https://github.com/EsotericSoftware/kryo", "description": "A fast and efficient object graph serialization framework for Java.", "stars": "6.5k"}, {"name": "Apache Beam", "url": "https://beam.apache.org/", "description": "A unified programming model that implements both batch and streaming data processing jobs that run on many execution engines."}, {"name": "Spark Streaming", "url": "https://spark.apache.org/streaming/", "description": "Makes it easy to build scalable fault-tolerant streaming applications."}, {"name": "Apache Flink", "url": "https://flink.apache.org/", "description": "A streaming dataflow engine that provides data distribution, communication, and fault tolerance for distributed computations over data streams."}, {"name": "Apache Storm", "url": "https://storm.apache.org", "description": "A free and open source distributed realtime computation system."}, {"name": "Apache Samza", "url": "https://samza.apache.org", "description": "A distributed stream processing framework."}, {"name": "Apache NiFi", "url": "https://nifi.apache.org/", "description": "An easy to use, powerful, and reliable system to process and distribute data."}, {"name": "Apache Hudi", "url": "https://hudi.apache.org/", "description": "An open source framework for managing storage for real time processing, one of the most interesting feature is the Upsert."}, {"name": "CocoIndex", "url": "https://github.com/cocoindex-io/cocoindex", "description": "An open source ETL framework to build fresh index for AI.", "stars": "6.1k"}, {"name": "VoltDB", "url": "https://voltdb.com/", "description": "An ACID-compliant RDBMS which uses a [shared nothing architecture](https://en.wikipedia.org/wiki/Shared-nothing_architecture)."}, {"name": "PipelineDB", "url": "https://github.com/pipelinedb/pipelinedb", "description": "The Streaming SQL Database.", "stars": "2.7k"}, {"name": "Spring Cloud Dataflow", "url": "https://cloud.spring.io/spring-cloud-dataflow/", "description": "Streaming and tasks execution between Spring Boot apps."}, {"name": "Bonobo", "url": "https://www.bonobo-project.org/", "description": "A data-processing toolkit for python 3.5+."}, {"name": "Robinhood's Faust", "url": "https://github.com/faust-streaming/faust", "description": "Forever scalable event processing & in-memory durable K/V store as a library with asyncio & static typing.", "stars": "1.9k"}, {"name": "HStreamDB", "url": "https://github.com/hstreamdb/hstream", "description": "The streaming database built for IoT data storage and real-time processing.", "stars": "726"}, {"name": "Kuiper", "url": "https://github.com/emqx/kuiper", "description": "An edge lightweight IoT data analytics/streaming software implemented by Golang, and it can be run at all kinds of resource-constrained edge devices.", "stars": "1.7k"}, {"name": "Zilla", "url": "https://github.com/aklivity/zilla", "description": "- An API gateway built for event-driven architectures and streaming that supports standard protocols such as HTTP, SSE, gRPC, MQTT, and the native Kafka protocol.", "stars": "678"}, {"name": "SwimOS", "url": "https://github.com/swimos/swim-rust", "description": "A framework for building real-time streaming data processing applications that supports a wide range of ingestion sources."}, {"name": "Pathway", "url": "https://github.com/pathwaycom/pathway", "description": "Performant open-source Python ETL framework with Rust runtime, supporting 300+ data sources.", "stars": "59k"}, {"name": "Hadoop MapReduce", "url": "https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html", "description": "A software framework for easily writing applications which process vast amounts of data (multi-terabyte data-sets) - in-parallel on large clusters (thousands of nodes) - of commodity hardware in a reliable, fault-tolerant manner."}, {"name": "Spark", "url": "https://spark.apache.org/", "description": "A multi-language engine for executing data engineering, data science, and machine learning on single-node machines or clusters."}, {"name": "AWS EMR", "url": "https://aws.amazon.com/emr/", "description": "A web service that makes it easy to quickly and cost-effectively process vast amounts of data."}, {"name": "Data Mechanics", "url": "https://www.datamechanics.co", "description": "A cloud-based platform deployed on Kubernetes making Apache Spark more developer-friendly and cost-effective."}, {"name": "Tez", "url": "https://tez.apache.org/", "description": "An application framework which allows for a complex directed-acyclic-graph of tasks for processing data."}, {"name": "Bistro", "url": "https://github.com/asavinov/bistro", "description": "A light-weight engine for general-purpose data processing including both batch and stream analytics. It is based on a novel unique data model, which represents data via *functions* and processes data via *columns operations* as opposed to having only set operations in conventional approaches like MapReduce or SQL.", "stars": "8"}, {"name": "Substation", "url": "https://github.com/brexhq/substation", "description": "A cloud native data pipeline and transformation toolkit written in Go.", "stars": "389"}, {"name": "Highcharts", "url": "https://www.highcharts.com/", "description": "A charting library written in pure JavaScript, offering an easy way of adding interactive charts to your web site or web application."}, {"name": "ZingChart", "url": "https://www.zingchart.com/", "description": "Fast JavaScript charts for any data set."}, {"name": "C3.js", "url": "https://c3js.org", "description": "D3-based reusable chart library."}, {"name": "D3.js", "url": "https://d3js.org/", "description": "A JavaScript library for manipulating documents based on data."}, {"name": "SmoothieCharts", "url": "https://smoothiecharts.org", "description": "A JavaScript Charting Library for Streaming Data."}, {"name": "PyXley", "url": "https://github.com/stitchfix/pyxley", "description": "Python helpers for building dashboards using Flask and React.", "stars": "2.3k"}, {"name": "Plotly", "url": "https://github.com/plotly/dash", "description": "Flask, JS, and CSS boilerplate for interactive, web-based visualization apps in Python.", "stars": "25k"}, {"name": "Apache Superset", "url": "https://github.com/apache/incubator-superset", "description": "A modern, enterprise-ready business intelligence web application.", "stars": "71k"}, {"name": "Redash", "url": "https://redash.io/", "description": "Make Your Company Data Driven. Connect to any data source, easily visualize and share your data."}, {"name": "Metabase", "url": "https://github.com/metabase/metabase", "description": "The easy, open source way for everyone in your company to ask questions and learn from data.", "stars": "46k"}, {"name": "PyQtGraph", "url": "https://www.pyqtgraph.org/", "description": "A pure-python graphics and GUI library built on PyQt4 / PySide and numpy. It is intended for use in mathematics / scientific / engineering applications."}, {"name": "Seaborn", "url": "https://seaborn.pydata.org", "description": "A Python visualization library based on matplotlib. It provides a high-level interface for drawing attractive statistical graphics."}, {"name": "QueryGPT", "url": "https://github.com/MKY508/QueryGPT", "description": "Natural language database query interface with automatic chart generation, supporting Chinese and English queries.", "stars": "28"}, {"name": "Bruin", "url": "https://github.com/bruin-data/bruin", "description": "End-to-end data pipeline tool that combines ingestion, transformation (SQL + Python), and data quality in a single CLI. Connects to BigQuery, Snowflake, PostgreSQL, Redshift, and more. Includes VS Code extension with live previews.", "stars": "1.4k"}, {"name": "Luigi", "url": "https://github.com/spotify/luigi", "description": "A Python module that helps you build complex pipelines of batch jobs.", "stars": "19k"}, {"name": "CronQ", "url": "https://github.com/seatgeek/cronq", "description": "An application cron-like system. [Used](https://chairnerd.seatgeek.com/building-out-the-seatgeek-data-pipeline/) w/Luigi. Deprecated."}, {"name": "Cascading", "url": "https://www.cascading.org/", "description": "Java based application development platform."}, {"name": "Airflow", "url": "https://github.com/apache/airflow", "description": "A system to programmatically author, schedule, and monitor data pipelines.", "stars": "44k"}, {"name": "Azkaban", "url": "https://azkaban.github.io/", "description": "A batch workflow job scheduler created at LinkedIn to run Hadoop jobs. Azkaban resolves the ordering through job dependencies and provides an easy-to-use web user interface to maintain and track your workflows."}, {"name": "Oozie", "url": "https://oozie.apache.org/", "description": "A workflow scheduler system to manage Apache Hadoop jobs."}, {"name": "Pinball", "url": "https://github.com/pinterest/pinball", "description": "DAG based workflow manager. Job flows are defined programmatically in Python. Support output passing between jobs.", "stars": "1k"}, {"name": "Dagster", "url": "https://github.com/dagster-io/dagster", "description": "An open-source Python library for building data applications.", "stars": "15k"}, {"name": "Hamilton", "url": "https://github.com/dagworks-inc/hamilton", "description": "A lightweight library to define data transformations as a directed-acyclic graph (DAG). If you like dbt for SQL transforms, you will like Hamilton for Python processing.", "stars": "2.4k"}, {"name": "Kedro", "url": "https://kedro.readthedocs.io/en/latest/", "description": "A framework that makes it easy to build robust and scalable data pipelines by providing uniform project templates, data abstraction, configuration and pipeline assembly."}, {"name": "Dataform", "url": "https://dataform.co/", "description": "An open-source framework and web based IDE to manage datasets and their dependencies. SQLX extends your existing SQL warehouse dialect to add features that support dependency management, testing, documentation and more."}, {"name": "Census", "url": "https://getcensus.com/", "description": "A reverse-ETL tool that let you sync data from your cloud data warehouse to SaaS applications like Salesforce, Marketo, HubSpot, Zendesk, etc. No engineering favors required\u2014just SQL."}, {"name": "dbt", "url": "https://getdbt.com/", "description": "A command line tool that enables data analysts and engineers to transform data in their warehouses more effectively."}, {"name": "Kestra", "url": "https://github.com/kestra-io/kestra", "description": "Scalable, event-driven, language-agnostic orchestration and scheduling platform to manage millions of workflows declaratively in code.", "stars": "26k"}, {"name": "RudderStack", "url": "https://github.com/rudderlabs/rudder-server", "description": "A warehouse-first Customer Data Platform that enables you to collect data from every application, website and SaaS platform, and then activate it in your warehouse and business tools.", "stars": "4.4k"}, {"name": "PACE", "url": "https://github.com/getstrm/pace", "description": "An open source framework that allows you to enforce agreements on how data should be accessed, used, and transformed, regardless of the data platform (Snowflake, BigQuery, DataBricks, etc.)", "stars": "38"}, {"name": "Prefect", "url": "https://prefect.io/", "description": "An orchestration and observability platform. With it, developers can rapidly build and scale resilient code, and triage disruptions effortlessly."}, {"name": "Multiwoven", "url": "https://github.com/Multiwoven/multiwoven", "description": "The open-source reverse ETL, data activation platform for modern data teams.", "stars": "1.6k"}, {"name": "SuprSend", "url": "https://www.suprsend.com/products/workflows", "description": "Create automated workflows and logic using API's for your notification service. Add templates, batching, preferences, inapp inbox with workflows to trigger notifications directly from your data warehouse."}, {"name": "Mage", "url": "https://www.mage.ai", "description": "Open-source data pipeline tool for transforming and integrating data."}, {"name": "SQLMesh", "url": "https://sqlmesh.readthedocs.io", "description": "An open-source data transformation framework for managing, testing, and deploying SQL and Python-based data pipelines with version control, environment isolation, and automatic dependency resolution."}, {"name": "lakeFS", "url": "https://github.com/treeverse/lakeFS", "description": "An open source platform that delivers resilience and manageability to object-storage based data lakes.", "stars": "5.2k"}, {"name": "Project Nessie", "url": "https://github.com/projectnessie/nessie", "description": "A Transactional Catalog for Data Lakes with Git-like semantics. Works with Apache Iceberg tables.", "stars": "1.4k"}, {"name": "Ilum", "url": "https://ilum.cloud/", "description": "A modular Data Lakehouse platform that simplifies the management and monitoring of Apache Spark clusters across Kubernetes and Hadoop environments."}, {"name": "Gravitino", "url": "https://github.com/apache/gravitino", "description": "An open-source, unified metadata management for data lakes, data warehouses, and external catalogs.", "stars": "2.9k"}, {"name": "FlightPath Data", "url": "https://www.flightpathdata.com", "description": "FlightPath is a gateway to a data lake's bronze layer, protecting it from invalid external data file feeds as a trusted publisher."}, {"name": "docker-logstash", "url": "https://github.com/pblittle/docker-logstash", "description": "A highly configurable Logstash (1.4.4) - Docker image running Elasticsearch (1.7.0) - and Kibana (3.1.2).", "stars": "237"}, {"name": "elasticsearch-jdbc", "url": "https://github.com/jprante/elasticsearch-jdbc", "description": "JDBC importer for Elasticsearch.", "stars": "2.8k"}, {"name": "ZomboDB", "url": "https://github.com/zombodb/zombodb", "description": "PostgreSQL Extension that allows creating an index backed by Elasticsearch.", "stars": "4.7k"}, {"name": "Gockerize", "url": "https://github.com/redbooth/gockerize", "description": "Package golang service into minimal Docker containers.", "stars": "667"}, {"name": "Flocker", "url": "https://github.com/ClusterHQ/flocker", "description": "Easily manage Docker containers & their data.", "stars": "3.4k"}, {"name": "Rancher", "url": "https://rancher.com/rancher-os/", "description": "RancherOS is a 20mb Linux distro that runs the entire OS as Docker containers."}, {"name": "Kontena", "url": "https://www.kontena.io/", "description": "Application Containers for Masses."}, {"name": "Weave", "url": "https://github.com/weaveworks/weave", "description": "Weaving Docker containers into applications.", "stars": "6.6k"}, {"name": "Zodiac", "url": "https://github.com/CenturyLinkLabs/zodiac", "description": "A lightweight tool for easy deployment and rollback of dockerized applications.", "stars": "200"}, {"name": "cAdvisor", "url": "https://github.com/google/cadvisor", "description": "Analyzes resource usage and performance characteristics of running containers.", "stars": "19k"}, {"name": "Micro S3 persistence", "url": "https://github.com/figadore/micro-s3-persistence", "description": "Docker microservice for saving/restoring volume data to S3.", "stars": "14"}, {"name": "Rocker-compose", "url": "https://github.com/grammarly/rocker-compose", "description": "Docker composition tool with idempotency features for deploying apps composed of multiple containers. Deprecated.", "stars": "408"}, {"name": "Nomad", "url": "https://github.com/hashicorp/nomad", "description": "A cluster manager, designed for both long-lived services and short-lived batch processing workloads.", "stars": "16k"}, {"name": "ImageLayers", "url": "https://imagelayers.io/", "description": "Visualize Docker images and the layers that compose them."}], "notes": []}, {"name": "Data Engineering \u2014 Realtime", "entries": [{"name": "Twitter Realtime", "url": "https://developer.twitter.com/en/docs/tweets/filter-realtime/overview", "description": "The Streaming APIs give developers low latency access to Twitter's global stream of Tweet data."}, {"name": "Eventsim", "url": "https://github.com/Interana/eventsim", "description": "Event data simulator. Generates a stream of pseudo-random events from a set of users, designed to simulate web traffic.", "stars": "535"}, {"name": "Reddit", "url": "https://www.reddit.com/r/datasets/comments/3mk1vg/realtime_data_is_available_including_comments/", "description": "Real-time data is available including comments, submissions and links posted to reddit."}], "notes": []}, {"name": "Data Engineering \u2014 Data Dumps", "entries": [{"name": "GitHub Archive", "url": "https://www.gharchive.org/", "description": "GitHub's public timeline since 2011, updated every hour."}, {"name": "Common Crawl", "url": "https://commoncrawl.org/", "description": "Open source repository of web crawl data."}, {"name": "Wikipedia", "url": "https://dumps.wikimedia.org/enwiki/latest/", "description": "Wikipedia's complete copy of all wikis, in the form of Wikitext source and metadata embedded in XML. A number of raw database tables in SQL form are also available."}], "notes": []}, {"name": "Data Engineering \u2014 Prometheus", "entries": [{"name": "Prometheus.io", "url": "https://github.com/prometheus/prometheus", "description": "An open-source service monitoring system and time series database.", "stars": "63k"}, {"name": "HAProxy Exporter", "url": "https://github.com/prometheus/haproxy_exporter", "description": "Simple server that scrapes HAProxy stats and exports them via HTTP for Prometheus consumption.", "stars": "627"}], "notes": []}, {"name": "Data Engineering \u2014 Data Profiler", "entries": [{"name": "Data Profiler", "url": "https://github.com/capitalone/dataprofiler", "description": "The DataProfiler is a Python library designed to make data analysis, monitoring, and sensitive data detection easy.", "stars": "1.5k"}, {"name": "YData Profiling", "url": "https://docs.profiling.ydata.ai/latest/", "description": "A general-purpose open-source data profiler for high-level analysis of a dataset."}, {"name": "Desbordante", "url": "https://github.com/desbordante/desbordante-core", "description": "An open-source data profiler specifically focused on discovery and validation of complex patterns in data.", "stars": "465"}, {"name": "Grai", "url": "https://github.com/grai-io/grai-core/", "description": "A data catalog tool that integrates into your CI system exposing downstream impact testing of data changes. These tests prevent data changes which might break data pipelines or BI dashboards from making it to production.", "stars": "313"}, {"name": "DQOps", "url": "https://github.com/dqops/dqo", "description": "An open-source data quality platform for the whole data platform lifecycle from profiling new data sources to applying full automation of data quality monitoring.", "stars": "187"}, {"name": "DataKitchen", "url": "https://datakitchen.io/", "description": "Open Source Data Observability for end-to-end Data Journey Observability, data profiling, anomaly detection, and auto-created data quality validation tests."}, {"name": "GreatExpectation", "url": "https://greatexpectations.io/", "description": "Open Source data validation framework to manage data quality. Users can define and document \u201cexpectations\u201d rules about how data should look and behave."}, {"name": "RunSQL", "url": "https://runsql.com/", "description": "Free online SQL playground for MySQL, PostgreSQL, and SQL Server. Create database structures, run queries, and share results instantly."}, {"name": "Spark Playground", "url": "https://www.sparkplayground.com/", "description": "Write, run, and test PySpark code on Spark Playground's online compiler. Access real-world sample datasets & solve interview questions to enhance your PySpark skills for data engineering roles."}, {"name": "daffy", "url": "https://github.com/vertti/daffy/", "description": "Decorator-first DataFrame contracts/validation (columns/dtypes/constraints) at function boundaries. Supports Pandas/Polars/PyArrow/Modin.", "stars": "53"}, {"name": "Snowflake Emulator", "url": "https://github.com/nnnkkk7/snowflake-emulator", "description": "A Snowflake-compatible emulator for local development and testing.", "stars": "24"}], "notes": []}, {"name": "Data Engineering \u2014 Forums", "entries": [{"name": "/r/dataengineering", "url": "https://www.reddit.com/r/dataengineering/", "description": "News, tips, and background on Data Engineering."}, {"name": "/r/etl", "url": "https://www.reddit.com/r/ETL/", "description": "Subreddit focused on ETL."}], "notes": []}, {"name": "Data Engineering \u2014 Conferences", "entries": [{"name": "Data Council", "url": "https://www.datacouncil.ai/about", "description": "The first technical conference that bridges the gap between data scientists, data engineers and data analysts."}], "notes": []}, {"name": "Data Engineering \u2014 Podcasts", "entries": [{"name": "Data Engineering Podcast", "url": "https://www.dataengineeringpodcast.com/", "description": "The show about modern data infrastructure."}, {"name": "The Data Stack Show", "url": "https://datastackshow.com/", "description": "A show where they talk to data engineers, analysts, and data scientists about their experience around building and maintaining data infrastructure, delivering data and data products, and driving better outcomes across their businesses with data."}], "notes": []}, {"name": "Data Engineering \u2014 Books", "entries": [{"name": "Snowflake Data Engineering", "url": "https://www.manning.com/books/snowflake-data-engineering", "description": "A practical introduction to data engineering on the Snowflake cloud data platform."}, {"name": "Best Data Science Books", "url": "https://www.appliedaicourse.com/blog/data-science-books/", "description": "This blog offers a curated list of top data science books, categorized by topics and learning stages, to aid readers in building foundational knowledge and staying updated with industry trends."}, {"name": "Architecting an Apache Iceberg Lakehouse", "url": "https://www.manning.com/books/architecting-an-apache-iceberg-lakehouse", "description": "A guide to designing an Apache Iceberg lakehouse from scratch."}, {"name": "Learn AI Data Engineering in a Month of Lunches", "url": "https://www.manning.com/books/learn-ai-data-engineering-in-a-month-of-lunches", "description": "A fast, friendly guide to integrating large language models into your data workflows."}], "notes": []}], "total_entries": 170}, {"name": "Hadoop", "subcategories": [{"name": "Hadoop", "entries": [{"name": "Awesome Hadoop", "url": "#awesome-hadoop", "description": ""}, {"name": "Resources", "url": "#resources", "description": ""}, {"name": "Other Awesome Lists", "url": "#other-awesome-lists", "description": ""}, {"name": "Apache Hadoop", "url": "http://hadoop.apache.org/", "description": "Apache Hadoop"}, {"name": "Apache Hadoop Ozone", "url": "http://hadoop.apache.org/ozone/", "description": "An Object Store for Apache Hadoop"}, {"name": "Apache Tez", "url": "http://tez.apache.org/", "description": "A Framework for YARN-based, Data Processing Applications In Hadoop"}, {"name": "SpatialHadoop", "url": "http://spatialhadoop.cs.umn.edu/", "description": "SpatialHadoop is a MapReduce extension to Apache Hadoop designed specially to work with spatial data."}, {"name": "GIS Tools for Hadoop", "url": "http://esri.github.io/gis-tools-for-hadoop/", "description": "Big Data Spatial Analytics for the Hadoop Framework"}, {"name": "Elasticsearch Hadoop", "url": "https://github.com/elastic/elasticsearch-hadoop", "description": "Elasticsearch real-time search and analytics natively integrated with Hadoop. Supports Map/Reduce, Cascading, Apache Hive and Apache Pig.", "stars": "1.9k"}, {"name": "hadoopy", "url": "https://github.com/bwhite/hadoopy", "description": "Python MapReduce library written in Cython.", "stars": "243"}, {"name": "mrjob", "url": "https://github.com/Yelp/mrjob/", "description": "mrjob is a Python 2.5+ package that helps you write and run Hadoop Streaming jobs.", "stars": "2.6k"}, {"name": "pydoop", "url": "http://pydoop.sourceforge.net/", "description": "Pydoop is a package that provides a Python API for Hadoop."}, {"name": "hdfs-du", "url": "https://github.com/twitter/hdfs-du", "description": "HDFS-DU is an interactive visualization of the Hadoop distributed file system.", "stars": "228"}, {"name": "White Elephant", "url": "https://github.com/linkedin/white-elephant", "description": "Hadoop log aggregator and dashboard", "stars": "191"}, {"name": "Genie", "url": "https://github.com/Netflix/genie", "description": "Genie provides REST-ful APIs to run Hadoop, Hive and Pig jobs, and to manage multiple Hadoop resources and perform job submissions across them.", "stars": "1.6k"}, {"name": "Apache Kylin", "url": "http://kylin.incubator.apache.org/", "description": "Apache Kylin is an open source Distributed Analytics Engine from eBay Inc. that provides SQL interface and multi-dimensional analysis (OLAP) on Hadoop supporting extremely large datasets"}, {"name": "Crunch", "url": "https://github.com/jondot/crunch", "description": "Go-based toolkit for ETL and feature extraction on Hadoop", "stars": "207"}, {"name": "Apache Ignite", "url": "http://ignite.apache.org/", "description": "Distributed in-memory platform"}, {"name": "Apache Slider", "url": "http://slider.incubator.apache.org/", "description": "Apache Slider is a project in incubation at the Apache Software Foundation with the goal of making it possible and easy to deploy existing applications onto a YARN cluster."}, {"name": "Apache Twill", "url": "http://twill.incubator.apache.org/", "description": "Apache Twill is an abstraction over Apache Hadoop\u00ae YARN that reduces the complexity of developing distributed applications, allowing developers to focus more on their application logic."}, {"name": "mpich2-yarn", "url": "https://github.com/alibaba/mpich2-yarn", "description": "Running MPICH2 on Yarn", "stars": "110"}, {"name": "Apache HBase", "url": "http://hbase.apache.org", "description": "Apache HBase"}, {"name": "Apache Phoenix", "url": "http://phoenix.apache.org/", "description": "A SQL skin over HBase supporting secondary indices"}, {"name": "happybase", "url": "https://github.com/wbolster/happybase", "description": "A developer-friendly Python library to interact with Apache HBase.", "stars": "595"}, {"name": "Hannibal", "url": "https://github.com/sentric/hannibal", "description": "Hannibal is tool to help monitor and maintain HBase-Clusters that are configured for manual splitting.", "stars": "170"}, {"name": "Haeinsa", "url": "https://github.com/VCNC/haeinsa", "description": "Haeinsa is linearly scalable multi-row, multi-table transaction library for HBase", "stars": "158"}, {"name": "hindex", "url": "https://github.com/Huawei-Hadoop/hindex", "description": "Secondary Index for HBase", "stars": "588"}, {"name": "Apache Accumulo", "url": "https://accumulo.apache.org/", "description": "The Apache Accumulo\u2122 sorted, distributed key/value store is a robust, scalable, high performance data storage and retrieval system."}, {"name": "OpenTSDB", "url": "http://opentsdb.net/", "description": "The Scalable Time Series Database"}, {"name": "Apache Cassandra", "url": "http://cassandra.apache.org/", "description": ""}, {"name": "Apache Hive", "url": "http://hive.apache.org", "description": "The Apache Hive data warehouse software facilitates reading, writing, and managing large datasets residing in distributed storage using SQL"}, {"name": "Apache Phoenix", "url": "http://phoenix.apache.org", "description": ""}, {"name": "Apache HAWQ (incubating)", "url": "http://hawq.incubator.apache.org/", "description": "Apache HAWQ is a Hadoop native SQL query engine that combines the key technological advantages of MPP database with the scalability and convenience of Hadoop"}, {"name": "Lingual", "url": "http://www.cascading.org/projects/lingual/", "description": "SQL interface for Cascading (MR/Tez job generator)"}, {"name": "Apache Impala", "url": "https://impala.apache.org/", "description": "Apache Impala is an open source massively parallel processing (MPP) SQL query engine for data stored in a computer cluster running Apache Hadoop. Impala has been described as the open-source equivalent of Google F1, which inspired its development in 2012."}, {"name": "Presto", "url": "https://prestodb.io/", "description": "Distributed SQL Query Engine for Big Data. Open sourced by Facebook."}, {"name": "Apache Tajo", "url": "http://tajo.apache.org/", "description": "Data warehouse system for Apache Hadoop"}, {"name": "Apache Drill", "url": "https://drill.apache.org/", "description": "Schema-free SQL Query Engine"}, {"name": "Apache Trafodion", "url": "http://trafodion.apache.org/", "description": ""}, {"name": "Apache Calcite", "url": "http://calcite.apache.org/", "description": "A Dynamic Data Management Framework"}, {"name": "Apache Atlas", "url": "http://atlas.incubator.apache.org/", "description": "Metadata tagging & lineage capture suppoting complex business data taxonomies"}, {"name": "Apache Kudu", "url": "https://kudu.apache.org/", "description": "Kudu provides a combination of fast inserts/updates and efficient columnar scans to enable multiple real-time analytic workloads across a single storage layer, complementing HDFS and Apache HBase."}, {"name": "Confluent Schema registry for Kafka", "url": "https://github.com/confluentinc/schema-registry", "description": "Schema Registry provides a serving layer for your metadata. It provides a RESTful interface for storing and retrieving Avro schemas.", "stars": "1.8k"}, {"name": "Hortonworks Schema Registry", "url": "https://github.com/hortonworks/registry", "description": "Schema Registry is a framework to build metadata repositories.", "stars": "216"}, {"name": "Apache Oozie", "url": "http://oozie.apache.org", "description": "Apache Oozie"}, {"name": "Azkaban", "url": "http://azkaban.github.io/", "description": ""}, {"name": "Apache Falcon", "url": "http://falcon.apache.org/", "description": "Data management and processing platform"}, {"name": "Apache NiFi", "url": "http://nifi.apache.org/", "description": "A dataflow system"}, {"name": "Apache AirFlow", "url": "https://github.com/apache/incubator-airflow", "description": "Airflow is a workflow automation and scheduling system that can be used to author and manage data pipelines", "stars": "28k"}, {"name": "Luigi", "url": "http://luigi.readthedocs.org/en/latest/", "description": "Python package that helps you build complex pipelines of batch jobs"}, {"name": "Apache Flume", "url": "http://flume.apache.org", "description": "Apache Flume"}, {"name": "Suro", "url": "https://github.com/Netflix/suro", "description": "Netflix's distributed Data Pipeline", "stars": "772"}, {"name": "Apache Sqoop", "url": "http://sqoop.apache.org", "description": "Apache Sqoop"}, {"name": "Apache Kafka", "url": "http://kafka.apache.org/", "description": "Apache Kafka"}, {"name": "Gobblin from LinkedIn", "url": "https://github.com/linkedin/gobblin", "description": "Universal data ingestion framework for Hadoop", "stars": "2.1k"}, {"name": "Apache Pig", "url": "http://pig.apache.org", "description": "Apache Pig"}, {"name": "Apache DataFu", "url": "http://datafu.incubator.apache.org/", "description": "A collection of libraries for working with large-scale data in Hadoop"}, {"name": "vahara", "url": "https://github.com/thedatachef/varaha", "description": "Machine learning and natural language processing with Apache Pig", "stars": "51"}, {"name": "packetpig", "url": "https://github.com/packetloop/packetpig", "description": "Open Source Big Data Security Analytics", "stars": "301"}, {"name": "akela", "url": "https://github.com/mozilla-metrics/akela", "description": "Mozilla's utility library for Hadoop, HBase, Pig, etc.", "stars": "76"}, {"name": "seqpig", "url": "http://seqpig.sourceforge.net/", "description": "Simple and scalable scripting for large sequencing data set(ex: bioinfomation) in Hadoop"}, {"name": "Lipstick", "url": "https://github.com/Netflix/Lipstick", "description": "Pig workflow visualization tool. [Introducing Lipstick on A(pache) Pig](http://techblog.netflix.com/2013/06/introducing-lipstick-on-apache-pig.html)", "stars": "460"}, {"name": "PigPen", "url": "https://github.com/Netflix/PigPen", "description": "PigPen is map-reduce for Clojure, or distributed Clojure. It compiles to Apache Pig, but you don't need to know much about Pig to use it.", "stars": "542"}, {"name": "Kite Software Development Kit", "url": "http://kitesdk.org/", "description": "A set of libraries, tools, examples, and documentation"}, {"name": "gohadoop", "url": "https://github.com/hortonworks/gohadoop", "description": "Native go clients for Apache Hadoop YARN.", "stars": "307"}, {"name": "Hue", "url": "http://gethue.com/", "description": "A Web interface for analyzing data with Apache Hadoop."}, {"name": "Apache Zeppelin", "url": "https://zeppelin.incubator.apache.org/", "description": "A web-based notebook that enables interactive data analytics"}, {"name": "Apache Thrift", "url": "http://thrift.apache.org/", "description": ""}, {"name": "Apache Avro", "url": "http://avro.apache.org/", "description": "Apache Avro is a data serialization system."}, {"name": "Elephant Bird", "url": "https://github.com/twitter/elephant-bird", "description": "Twitter's collection of LZO and Protocol Buffer-related Hadoop, Pig, Hive, and HBase code.", "stars": "1.1k"}, {"name": "Spring for Apache Hadoop", "url": "http://projects.spring.io/spring-hadoop/", "description": ""}, {"name": "hdfs - A native go client for HDFS", "url": "https://github.com/colinmarc/hdfs", "description": "", "stars": "1.2k"}, {"name": "Oozie Eclipse Plugin", "url": "https://marketplace.eclipse.org/content/oozie-eclipse-plugin", "description": "A graphical editor for editing Apache Oozie workflows inside Eclipse."}, {"name": "snakebite", "url": "https://pypi.python.org/pypi/snakebite/", "description": "A pure python HDFS client"}, {"name": "Apache Parquet", "url": "https://parquet.apache.org/", "description": "Apache Parquet is a columnar storage format available to any project in the Hadoop ecosystem, regardless of the choice of data processing framework, data model or programming language."}, {"name": "Apache Superset (incubating)", "url": "https://superset.incubator.apache.org/", "description": "Apache Superset (incubating) is a modern, enterprise-ready business intelligence web application"}, {"name": "Schema Registry UI", "url": "https://github.com/Landoop/schema-registry-ui", "description": "Web tool for the Confluent Schema Registry in order to create / view / search / evolve / view history & configure Avro schemas of your Kafka cluster.", "stars": "398"}, {"name": "Apache Storm", "url": "http://storm.apache.org/", "description": ""}, {"name": "Apache Samza", "url": "http://samza.apache.org/", "description": ""}, {"name": "Apache Spark", "url": "http://spark.apache.org/streaming/", "description": ""}, {"name": "Apache Flink", "url": "https://flink.apache.org", "description": "Apache Flink is a platform for efficient, distributed, general-purpose data processing. It supports exactly once stream processing."}, {"name": "Apache Pulsar (incubating)", "url": "http://pulsar.incubator.apache.org/", "description": "Apache Pulsar (incubating) is a highly scalable, low latency messaging platform running on commodity hardware. It provides simple pub-sub semantics over topics, guaranteed at-least-once delivery of messages, automatic cursor management for subscribers, and cross-datacenter replication."}, {"name": "Apache Druid (incubating)", "url": "http://druid.incubator.apache.org/", "description": "A high-performance, column-oriented, distributed data store."}, {"name": "Apache Spark", "url": "http://spark.apache.org/", "description": ""}, {"name": "Spark Packages", "url": "http://spark-packages.org/", "description": "A community index of packages for Apache Spark"}, {"name": "SparkHub", "url": "https://sparkhub.databricks.com/", "description": "A community site for Apache Spark"}, {"name": "Apache Crunch", "url": "http://crunch.apache.org", "description": ""}, {"name": "Cascading", "url": "http://www.cascading.org/", "description": "Cascading is the proven application development platform for building data applications on Hadoop."}, {"name": "Apache Flink", "url": "http://flink.apache.org/", "description": "Apache Flink is a platform for efficient, distributed, general-purpose data processing."}, {"name": "Apache Apex (incubating)", "url": "http://apex.incubator.apache.org/", "description": "Enterprise-grade unified stream and batch processing engine."}, {"name": "Apache Livy (incubating)", "url": "https://livy.incubator.apache.org/", "description": "Apache Livy (incubating) is web service that exposes a REST interface for managing long running Apache Spark contexts in your cluster. With Livy, new applications can be built on top of Apache Spark that require fine grained interaction with many Spark contexts."}, {"name": "Apache Bigtop", "url": "http://bigtop.apache.org/", "description": "Apache Bigtop: Packaging and tests of the Apache Hadoop ecosystem"}, {"name": "Apache Ambari", "url": "http://ambari.apache.org/", "description": "Apache Ambari"}, {"name": "Ganglia Monitoring System", "url": "http://ganglia.sourceforge.net/", "description": ""}, {"name": "ankush", "url": "https://github.com/impetus-opensource/ankush", "description": "A big data cluster management tool that creates and manages clusters of different technologies.", "stars": "21"}, {"name": "Apache Zookeeper", "url": "http://zookeeper.apache.org/", "description": "Apache Zookeeper"}, {"name": "Apache Curator", "url": "http://curator.apache.org/", "description": "ZooKeeper client wrapper and rich ZooKeeper framework"}, {"name": "inviso", "url": "https://github.com/Netflix/inviso", "description": "Inviso is a lightweight tool that provides the ability to search for Hadoop jobs, visualize the performance, and view cluster utilization.", "stars": "200"}, {"name": "Logit.io", "url": "https://logit.io/", "description": "Send logs from Hadoop to Elasticsearch for monitoring and alerting."}, {"name": "ElasticSearch", "url": "https://www.elastic.co/", "description": ""}, {"name": "Apache Solr", "url": "http://lucene.apache.org/solr/", "description": "Apache Solr is an open source search platform built upon a Java library called Lucene."}, {"name": "Banana", "url": "https://github.com/LucidWorks/banana", "description": "Kibana port for Apache Solr", "stars": "667"}, {"name": "Apache Nutch", "url": "http://nutch.apache.org/", "description": "Apache Nutch is a highly extensible and scalable open source web crawler software project."}, {"name": "Apache Ranger", "url": "http://ranger.incubator.apache.org/", "description": "Ranger is a framework to enable, monitor and manage comprehensive data security across the Hadoop platform."}, {"name": "Apache Sentry", "url": "https://sentry.incubator.apache.org/", "description": "An authorization module for Hadoop"}, {"name": "Apache Knox Gateway", "url": "https://knox.apache.org/", "description": "A REST API Gateway for interacting with Hadoop clusters."}, {"name": "Big Data Benchmark", "url": "https://amplab.cs.berkeley.edu/benchmark/", "description": ""}, {"name": "HiBench", "url": "https://github.com/intel-hadoop/HiBench", "description": "", "stars": "1.3k"}, {"name": "YCSB", "url": "https://github.com/brianfrankcooper/YCSB", "description": "The Yahoo! Cloud Serving Benchmark (YCSB) is an open-source specification and program suite for evaluating retrieval and maintenance capabilities of computer programs. It is often used to compare relative performance of NoSQL database management systems.", "stars": "4.3k"}, {"name": "Apache Mahout", "url": "http://mahout.apache.org", "description": ""}, {"name": "Oryx 2", "url": "https://github.com/OryxProject/oryx", "description": "Lambda architecture on Spark, Kafka for real-time large scale machine learning", "stars": "1.8k"}, {"name": "MLlib", "url": "https://spark.apache.org/mllib/", "description": "MLlib is Apache Spark's scalable machine learning library."}, {"name": "R", "url": "http://www.r-project.org/", "description": "R is a free software environment for statistical computing and graphics."}, {"name": "RHadoop", "url": "https://github.com/RevolutionAnalytics/RHadoop/wiki", "description": "", "stars": "762"}, {"name": "Apache Lens", "url": "http://lens.apache.org/", "description": ""}, {"name": "Apache SINGA (incubating)", "url": "https://singa.incubator.apache.org/", "description": "SINGA is a general distributed deep learning platform for training big deep learning models over large datasets"}, {"name": "BigDL", "url": "https://bigdl-project.github.io/", "description": "BigDL is a distributed deep learning library for Apache Spark; with BigDL, users can write their deep learning applications as standard Spark programs, which can directly run on top of existing Spark or Hadoop clusters."}, {"name": "Apache Hivemall (incubating)", "url": "http://hivemall.incubator.apache.org/", "description": "Apache Hivemall is a scalable machine learning library that runs on Apache Hive, Spark and Pig."}, {"name": "Hadoop Weekly", "url": "http://www.hadoopweekly.com/", "description": ""}, {"name": "The Hadoop Ecosystem Table", "url": "http://hadoopecosystemtable.github.io/", "description": ""}, {"name": "Hadoop illuminated", "url": "http://hadoopilluminated.com/", "description": "Open Source Hadoop Book"}, {"name": "AWS BigData Blog", "url": "http://blogs.aws.amazon.com/bigdata/", "description": ""}, {"name": "Hadoop360", "url": "http://www.hadoop360.com/", "description": ""}, {"name": "How to monitor Hadoop metrics", "url": "https://www.datadoghq.com/blog/monitor-hadoop-metrics/", "description": ""}, {"name": "Apache Hadoop In Theory And Practice", "url": "http://www.slideshare.net/AdamKawa/hadoop-intheoryandpractice", "description": ""}, {"name": "Hadoop Operations at LinkedIn", "url": "http://www.slideshare.net/allenwittenauer/2013-hadoopsummitemea", "description": ""}, {"name": "Hadoop Performance at LinkedIn", "url": "http://www.slideshare.net/allenwittenauer/2012-lihadoopperf", "description": ""}, {"name": "Docker based Hadoop provisioning", "url": "http://www.slideshare.net/JanosMatyas/docker-based-hadoop-provisioning", "description": ""}, {"name": "Hadoop: The Definitive Guide", "url": "http://www.amazon.com/gp/product/1449311520/ref=as_li_ss_tl?ie=UTF8\\&camp=1789\\&creative=390957\\&creativeASIN=1449311520\\&linkCode=as2\\&tag=matratsblo-20", "description": ""}, {"name": "Hadoop Operations", "url": "http://www.amazon.com/gp/product/1449327052/ref=as_li_ss_tl?ie=UTF8\\&camp=1789\\&creative=390957\\&creativeASIN=1449327052\\&linkCode=as2\\&tag=matratsblo-20", "description": ""}, {"name": "Apache Hadoop Yarn", "url": "http://www.amazon.com/dp/0321934504?tag=matratsblo-20", "description": ""}, {"name": "HBase: The Definitive Guide", "url": "http://shop.oreilly.com/product/0636920014348.do", "description": ""}, {"name": "Programming Pig", "url": "http://shop.oreilly.com/product/0636920018087.do", "description": ""}, {"name": "Programming Hive", "url": "http://shop.oreilly.com/product/0636920023555.do", "description": ""}, {"name": "Hadoop in Practice, Second Edition", "url": "http://www.manning.com/holmes2/", "description": ""}, {"name": "Hadoop in Action, Second Edition", "url": "http://www.manning.com/lam2/", "description": ""}, {"name": "ApacheCon", "url": "http://www.apachecon.com/", "description": ""}, {"name": "Strata + Hadoop World", "url": "http://conferences.oreilly.com/strata", "description": ""}, {"name": "DataWorks Summit", "url": "https://dataworkssummit.com/", "description": ""}, {"name": "Spark Summit", "url": "https://databricks.com/sparkaisummit", "description": ""}], "notes": []}], "total_entries": 140}, {"name": "Streaming", "subcategories": [{"name": "Streaming \u2014 Table of Contents", "entries": [{"name": "Streaming Engine", "url": "#streaming-engine", "description": ""}, {"name": "Streaming Library", "url": "#streaming-library", "description": ""}, {"name": "Streaming Application", "url": "#streaming-application", "description": ""}, {"name": "IoT", "url": "#iot", "description": ""}, {"name": "DSL", "url": "#dsl", "description": ""}, {"name": "Data Pipeline", "url": "#data-pipeline", "description": ""}, {"name": "Online Machine Learning", "url": "#online-machine-learning", "description": ""}, {"name": "Streaming SQL", "url": "#streaming-sql", "description": ""}, {"name": "Toolkit", "url": "#toolkit", "description": ""}, {"name": "Benchmark", "url": "#benchmark", "description": ""}, {"name": "Closed Source", "url": "#closed-source", "description": ""}, {"name": "Readings", "url": "#readings", "description": ""}], "notes": []}, {"name": "Streaming \u2014 Streaming Engine", "entries": [{"name": "Apache Apex", "url": "https://github.com/apache/apex-core", "description": "", "stars": "350"}, {"name": "Apache Ballista", "url": "https://github.com/apache/arrow-ballista", "description": "", "stars": "2k"}, {"name": "Apache Flink", "url": "https://github.com/apache/flink", "description": "", "stars": "26k"}, {"name": "Apache Heron (incubating)", "url": "https://github.com/apache/incubator-heron", "description": "", "stars": "3.7k"}, {"name": "Apache Samza", "url": "https://github.com/apache/samza", "description": "", "stars": "837"}, {"name": "Apache Spark Streaming", "url": "https://github.com/apache/spark", "description": "", "stars": "43k"}, {"name": "Apache Storm", "url": "https://github.com/apache/storm", "description": "", "stars": "6.7k"}, {"name": "ArkFlow", "url": "https://github.com/arkflow-rs/arkflow", "description": "", "stars": "1.2k"}, {"name": "Arroyo", "url": "https://github.com/ArroyoSystems/arroyo", "description": "", "stars": "4.8k"}, {"name": "AthenaX", "url": "https://github.com/uber/AthenaX", "description": "", "stars": "1.2k"}, {"name": "Bytewax", "url": "https://github.com/bytewax/bytewax", "description": "", "stars": "2k"}, {"name": "CocoIndex", "url": "https://github.com/cocoindex-io/cocoindex", "description": "", "stars": "6k"}, {"name": "Faust", "url": "https://github.com/robinhood/faust", "description": "", "stars": "6.8k"}, {"name": "Gearpump", "url": "https://github.com/gearpump/gearpump", "description": "", "stars": "758"}, {"name": "Hazelcast Jet", "url": "https://github.com/hazelcast/hazelcast-jet", "description": "", "stars": "1.1k"}, {"name": "hailstorm", "url": "https://github.com/hailstorm-hs/hailstorm", "description": "", "stars": "92"}, {"name": "Maki Nage", "url": "https://github.com/maki-nage/makinage", "description": "", "stars": "42"}, {"name": "mantis", "url": "https://github.com/Netflix/mantis", "description": "", "stars": "1.5k"}, {"name": "mupd8(muppet)", "url": "https://github.com/walmartlabs/mupd8", "description": "", "stars": "128"}, {"name": "NebulaStream", "url": "https://github.com/nebulastream/nebulastream", "description": "", "stars": "75"}, {"name": "Numaflow", "url": "https://github.com/numaproj/numaflow", "description": "", "stars": "2.4k"}, {"name": "Onyx", "url": "https://github.com/onyx-platform/onyx", "description": "", "stars": "2k"}, {"name": "Pathway", "url": "https://github.com/pathwaycom/pathway", "description": "", "stars": "59k"}, {"name": "s4", "url": "https://github.com/apache/incubator-s4", "description": "", "stars": "43"}, {"name": "SABER", "url": "https://github.com/lsds/Saber", "description": "", "stars": "42"}, {"name": "Scramjet Cloud Platform", "url": "https://github.com/scramjetorg/transform-hub", "description": "", "stars": "69"}, {"name": "SPQR", "url": "https://github.com/ottogroup/SPQR", "description": "", "stars": "30"}, {"name": "tigon", "url": "https://github.com/caskdata/tigon", "description": "", "stars": "285"}, {"name": "Teknek", "url": "https://github.com/edwardcapriolo/teknek-core", "description": "", "stars": "9"}, {"name": "Trill", "url": "https://github.com/Microsoft/trill", "description": "", "stars": "1.3k"}, {"name": "Wallaroo", "url": "https://github.com/WallarooLabs/wallaroo", "description": "", "stars": "1.5k"}, {"name": "LightSaber", "url": "https://github.com/lsds/LightSaber", "description": "", "stars": "73"}, {"name": "HStreamDB", "url": "https://github.com/hstreamdb/hstream", "description": "", "stars": "726"}, {"name": "Kuiper", "url": "https://github.com/emqx/kuiper", "description": "", "stars": "1.7k"}, {"name": "WindFlow", "url": "https://paragroup.github.io/WindFlow", "description": ""}, {"name": "RisingWave", "url": "https://github.com/risingwavelabs/risingwave", "description": "", "stars": "8.8k"}], "notes": []}, {"name": "Streaming \u2014 Streaming Library", "entries": [{"name": "Apache Kafka Streams", "url": "https://github.com/apache/kafka", "description": "", "stars": "32k"}, {"name": "Streamiz", "url": "https://github.com/LGouellec/kafka-streams-dotnet", "description": "", "stars": "525"}, {"name": "Akka Streams", "url": "https://github.com/akka/akka", "description": "", "stars": "13k"}, {"name": "Daggy", "url": "https://github.com/synacker/daggy", "description": "", "stars": "158"}, {"name": "Benthos", "url": "https://github.com/Jeffail/benthos", "description": "", "stars": "8.6k"}, {"name": "FS2(prev. 'Scalaz-Stream')", "url": "https://github.com/functional-streams-for-scala/fs2", "description": "", "stars": "2.4k"}, {"name": "FastStream", "url": "https://github.com/airtai/faststream", "description": "", "stars": "4.9k"}, {"name": "monix", "url": "https://github.com/monix/monix", "description": "", "stars": "1.9k"}, {"name": "Quix Streams", "url": "https://github.com/quixio/quix-streams", "description": "", "stars": "1.5k"}, {"name": "Scramjet Node.js", "url": "https://github.com/scramjetorg/framework-js", "description": "\\[Node.js] functional reactive stream programming framework written on top of Node.js object streams + [the legacy Scramjet.js version (\u2b50253)](https://github.com/scramjetorg/scramjet)", "stars": "39"}, {"name": "Scramjet Python", "url": "https://github.com/scramjetorg/framework-python", "description": "\\[Python] functional reactive stream programming framework written from scratch operating on object, string and buffer streams.", "stars": "35"}, {"name": "Scramjet C++", "url": "https://github.com/scramjetorg/framework-cpp", "description": "\\[C++] functional reactive stream programming framework written on top of Node.js object streams.", "stars": "3"}, {"name": "Streamline", "url": "https://github.com/hortonworks/streamline", "description": "", "stars": "166"}, {"name": "StreamAlert", "url": "https://github.com/airbnb/streamalert", "description": "", "stars": "2.9k"}, {"name": "Swave", "url": "https://github.com/sirthias/swave", "description": "", "stars": "173"}, {"name": "Streamz", "url": "https://github.com/python-streamz/streamz", "description": "", "stars": "1.3k"}, {"name": "Stream Ops", "url": "https://github.com/nanosai/stream-ops-java", "description": "", "stars": "49"}, {"name": "Substation", "url": "https://github.com/brexhq/substation", "description": "", "stars": "390"}, {"name": "Tributary", "url": "https://github.com/timkpaine/tributary", "description": "", "stars": "459"}, {"name": "YoMo", "url": "https://github.com/yomorun/yomo", "description": "", "stars": "1.9k"}, {"name": "Mediapipe", "url": "https://github.com/google/mediapipe", "description": "Cross-platform, customizable ML solutions for live and streaming media.", "stars": "34k"}], "notes": []}, {"name": "Streaming \u2014 Streaming Application", "entries": [{"name": "javactrl-kafka", "url": "https://github.com/javactrl/javactrl-kafka", "description": "", "stars": "15"}, {"name": "straw", "url": "https://github.com/rwalk/straw", "description": "", "stars": "102"}, {"name": "storm-crawler", "url": "https://github.com/DigitalPebble/storm-crawler", "description": "", "stars": "961"}, {"name": "Zilla", "url": "https://github.com/aklivity/zilla", "description": "", "stars": "676"}], "notes": []}, {"name": "Streaming \u2014 IoT", "entries": [{"name": "sensorbee", "url": "https://github.com/sensorbee/sensorbee", "description": "", "stars": "230"}, {"name": "Apache Edgent", "url": "https://github.com/apache/incubator-edgent", "description": "", "stars": "223"}, {"name": "Apache StreamPipes", "url": "https://github.com/apache/incubator-streampipes", "description": "", "stars": "705"}], "notes": []}, {"name": "Streaming \u2014 DSL", "entries": [{"name": "Apache Beam", "url": "https://github.com/apache/beam", "description": "", "stars": "8.5k"}, {"name": "coast", "url": "https://github.com/bkirwi/coast", "description": "", "stars": "60"}, {"name": "Esper", "url": "https://github.com/espertechinc/esper", "description": "", "stars": "874"}, {"name": "Streamparse", "url": "https://github.com/Parsely/streamparse", "description": "", "stars": "1.5k"}, {"name": "summingbird", "url": "https://github.com/twitter/summingbird", "description": "", "stars": "2.1k"}], "notes": []}, {"name": "Streaming \u2014 Data Pipeline", "entries": [{"name": "Apache Kafka", "url": "https://github.com/apache/kafka", "description": "", "stars": "32k"}, {"name": "Apache Pulsar", "url": "https://github.com/apache/incubator-pulsar", "description": "", "stars": "15k"}, {"name": "Apache RocketMQ", "url": "https://github.com/apache/rocketmq", "description": "", "stars": "22k"}, {"name": "AutoMQ", "url": "https://github.com/AutoMQ/automq", "description": "", "stars": "9.5k"}, {"name": "brooklin", "url": "https://github.com/linkedin/Brooklin/", "description": "", "stars": "952"}, {"name": "Bruin", "url": "https://github.com/bruin-data/bruin", "description": "", "stars": "1.4k"}, {"name": "camus", "url": "https://github.com/linkedin/camus", "description": "", "stars": "883"}, {"name": "databus", "url": "https://github.com/linkedin/databus", "description": "", "stars": "3.7k"}, {"name": "flume", "url": "https://github.com/apache/flume", "description": "", "stars": "2.6k"}, {"name": "fluvio", "url": "https://github.com/infinyon/fluvio", "description": "", "stars": "5.2k"}, {"name": "ingestr", "url": "https://github.com/bruin-data/ingestr", "description": "", "stars": "3.4k"}, {"name": "Gazette", "url": "https://github.com/gazette/core", "description": "", "stars": "785"}, {"name": "LogDevice", "url": "https://logdevice.io/", "description": ""}, {"name": "metaq", "url": "https://github.com/killme2008/Metamorphosis", "description": "", "stars": "1.3k"}, {"name": "NATS streaming", "url": "https://github.com/nats-io/nats-streaming-server", "description": "", "stars": "2.5k"}, {"name": "nsq", "url": "https://github.com/nsqio/nsq", "description": "", "stars": "26k"}, {"name": "Redpanda", "url": "https://github.com/redpanda-data/redpanda", "description": "", "stars": "12k"}, {"name": "RudderStack", "url": "https://github.com/rudderlabs/rudder-server", "description": "", "stars": "4.4k"}, {"name": "suro", "url": "https://github.com/Netflix/suro", "description": "", "stars": "796"}], "notes": []}, {"name": "Streaming \u2014 Online Machine Learning", "entries": [{"name": "Apache Samoa", "url": "https://github.com/apache/incubator-samoa", "description": "", "stars": "250"}, {"name": "DataSketches", "url": "https://github.com/DataSketches/sketches-core", "description": "", "stars": "945"}, {"name": "River", "url": "https://github.com/online-ml/river", "description": "", "stars": "5.7k"}, {"name": "streamDM", "url": "https://github.com/huawei-noah/streamDM", "description": "", "stars": "500"}, {"name": "StreamingBandit", "url": "https://github.com/Nth-iteration-labs/streamingbandit", "description": "", "stars": "83"}, {"name": "StormCV", "url": "https://github.com/sensorstorm/StormCV", "description": "", "stars": "172"}, {"name": "trident-ml", "url": "https://github.com/pmerienne/trident-ml", "description": "", "stars": "384"}, {"name": "yurita", "url": "https://github.com/paypal/yurita", "description": "", "stars": "108"}], "notes": []}, {"name": "Streaming \u2014 Streaming SQL", "entries": [{"name": "pipelinedb", "url": "https://github.com/pipelinedb/pipelinedb", "description": "", "stars": "2.7k"}, {"name": "squall", "url": "https://github.com/epfldata/squall", "description": "", "stars": "273"}, {"name": "StreamCQL", "url": "https://github.com/Zhiqiang-He/StreamCQL", "description": "", "stars": "0"}, {"name": "ksqlDB", "url": "https://github.com/confluentinc/ksql", "description": "", "stars": "285"}, {"name": "Materialize", "url": "https://materialize.com", "description": ""}, {"name": "Siddhi", "url": "https://github.com/siddhi-io/siddhi", "description": "", "stars": "1.6k"}, {"name": "Proton", "url": "https://github.com/timeplus-io/proton", "description": "", "stars": "2.1k"}], "notes": []}, {"name": "Streaming \u2014 Benchmark", "entries": [{"name": "storm-perf-test", "url": "https://github.com/yahoo/storm-perf-test", "description": "", "stars": "74"}, {"name": "streaming-benchmarks", "url": "https://github.com/yahoo/streaming-benchmarks", "description": "", "stars": "647"}, {"name": "flotilla", "url": "https://github.com/tylertreat/Flotilla", "description": "", "stars": "237"}], "notes": []}, {"name": "Streaming \u2014 Toolkit", "entries": [{"name": "akka", "url": "https://github.com/akka/akka", "description": "", "stars": "13k"}, {"name": "Apache Pekko", "url": "https://github.com/apache/incubator-pekko", "description": "", "stars": "1.5k"}, {"name": "pulsar", "url": "https://github.com/quantmind/pulsar/", "description": "", "stars": "1.9k"}, {"name": "aeron", "url": "https://github.com/real-logic/Aeron", "description": "", "stars": "8.4k"}, {"name": "StreamFlow", "url": "https://github.com/lmco/streamflow", "description": "", "stars": "255"}, {"name": "samza-luwak", "url": "https://github.com/romseygeek/samza-luwak", "description": "", "stars": "100"}, {"name": "Streamdal", "url": "https://streamdal.com", "description": ""}, {"name": "Turbine", "url": "https://github.com/Netflix/Turbine", "description": "", "stars": "833"}, {"name": "Nussknacker", "url": "https://github.com/TouK/nussknacker", "description": "", "stars": "708"}], "notes": []}, {"name": "Streaming \u2014 Closed Source", "entries": [{"name": "Amazon Kinesis Streams", "url": "https://aws.amazon.com/kinesis/", "description": ""}, {"name": "Azure Stream Analytics", "url": "https://azure.microsoft.com/en-us/services/stream-analytics/", "description": ""}, {"name": "Cloud Dataflow", "url": "https://cloud.google.com/dataflow/", "description": ""}, {"name": "concord", "url": "https://www.slideshare.net/concord-io/may-2016-data-by-the-bay-concord-simple-flexible-stream-processing-on-apache-mesos", "description": ""}, {"name": "IBM Streams", "url": "https://www.ibm.com/analytics/us/en/technology/stream-computing/", "description": ""}, {"name": "jubatus", "url": "http://jubat.us/en/", "description": ""}, {"name": "millwheel", "url": "http://research.google.com/pubs/pub41378.html", "description": "framework for building low-latency data-processing applications that is widely used at Google."}, {"name": "NVIDIA Deep Stream", "url": "https://developer.nvidia.com/deepstream-sdk", "description": ""}], "notes": []}, {"name": "Streaming \u2014 Readings", "entries": [], "notes": []}], "total_entries": 135}, {"name": "Spark", "subcategories": [{"name": "Spark \u2014 Language Bindings", "entries": [{"name": "Kotlin for Apache Spark", "url": "https://github.com/Kotlin/kotlin-spark-api", "description": "", "stars": "459"}, {"name": ".NET for Apache Spark", "url": "https://github.com/dotnet/spark", "description": "", "stars": "2k"}, {"name": "sparklyr", "url": "https://github.com/rstudio/sparklyr", "description": "", "stars": "952"}, {"name": "sparkle", "url": "https://github.com/tweag/sparkle", "description": "", "stars": "447"}, {"name": "spark-connect-rs", "url": "https://github.com/sjrusso8/spark-connect-rs", "description": "", "stars": "85"}, {"name": "spark-connect-go", "url": "https://github.com/apache/spark-connect-go", "description": "", "stars": "155"}, {"name": "spark-connect-csharp", "url": "https://github.com/mdrakiburrahman/spark-connect-csharp", "description": "", "stars": "1"}], "notes": []}, {"name": "Spark \u2014 Notebooks and IDEs", "entries": [{"name": "almond", "url": "https://almond.sh/", "description": ""}, {"name": "Apache Zeppelin", "url": "https://zeppelin.incubator.apache.org/", "description": ""}, {"name": "Polynote", "url": "https://polynote.org/", "description": ""}, {"name": "sparkmagic", "url": "https://github.com/jupyter-incubator/sparkmagic", "description": "", "stars": "1.3k"}], "notes": []}, {"name": "Spark \u2014 General Purpose Libraries", "entries": [{"name": "itachi", "url": "https://github.com/yaooqinn/itachi", "description": "", "stars": "56"}, {"name": "spark-daria", "url": "https://github.com/mrpowers-io/spark-daria", "description": "", "stars": "751"}, {"name": "quinn", "url": "https://github.com/mrpowers-io/quinn", "description": "", "stars": "632"}, {"name": "Apache DataFu", "url": "https://github.com/apache/datafu/tree/master/datafu-spark", "description": "", "stars": "115"}, {"name": "Joblib Apache Spark Backend", "url": "https://github.com/joblib/joblib-spark", "description": "", "stars": "241"}], "notes": []}, {"name": "Spark \u2014 SQL Data Sources", "entries": [{"name": "Spark XML", "url": "https://github.com/databricks/spark-xml", "description": "", "stars": "504"}, {"name": "Spark Cassandra Connector", "url": "https://github.com/datastax/spark-cassandra-connector", "description": "", "stars": "1.9k"}, {"name": "Mongo-Spark", "url": "https://github.com/mongodb/mongo-spark", "description": "", "stars": "710"}], "notes": []}, {"name": "Spark \u2014 Storage", "entries": [{"name": "Delta Lake", "url": "https://github.com/delta-io/delta", "description": "", "stars": "7.5k"}, {"name": "Apache Hudi", "url": "https://github.com/apache/hudi", "description": "", "stars": "5.4k"}, {"name": "Apache Iceberg", "url": "https://github.com/apache/iceberg", "description": "", "stars": "6.4k"}, {"name": "lakeFS", "url": "https://docs.lakefs.io/integrations/spark.html", "description": ""}], "notes": []}, {"name": "Spark \u2014 Bioinformatics", "entries": [{"name": "ADAM", "url": "https://github.com/bigdatagenomics/adam", "description": "", "stars": "1k"}, {"name": "Hail", "url": "https://github.com/hail-is/hail", "description": "", "stars": "976"}], "notes": []}, {"name": "Spark \u2014 GIS", "entries": [{"name": "Apache Sedona", "url": "https://github.com/apache/incubator-sedona", "description": "", "stars": "2k"}], "notes": []}, {"name": "Spark \u2014 Graph Processing", "entries": [{"name": "GraphFrames", "url": "https://github.com/graphframes/graphframes", "description": "", "stars": "997"}, {"name": "neo4j-spark-connector", "url": "https://github.com/neo4j-contrib/neo4j-spark-connector", "description": "", "stars": "313"}], "notes": []}, {"name": "Spark \u2014 Machine Learning Extension", "entries": [{"name": "Apache SystemML", "url": "https://systemml.apache.org/", "description": ""}, {"name": "Mahout Spark Bindings", "url": "https://mahout.apache.org/users/sparkbindings/home.html", "description": ""}, {"name": "KeystoneML", "url": "http://keystone-ml.org/", "description": "Type safe machine learning pipelines with RDDs."}, {"name": "JPMML-Spark", "url": "https://github.com/jpmml/jpmml-spark", "description": "", "stars": "94"}, {"name": "ModelDB", "url": "https://mitdbg.github.io/modeldb", "description": ""}, {"name": "Sparkling Water", "url": "https://github.com/h2oai/sparkling-water", "description": "", "stars": "965"}, {"name": "BigDL", "url": "https://github.com/intel-analytics/BigDL", "description": "", "stars": "6.6k"}, {"name": "MLeap", "url": "https://github.com/combust/mleap", "description": "", "stars": "1.5k"}, {"name": "Microsoft ML for Apache Spark", "url": "https://github.com/Azure/mmlspark", "description": "", "stars": "5.1k"}, {"name": "MLflow", "url": "https://mlflow.org/docs/latest/python_api/mlflow.spark.html#module-mlflow.spark", "description": ""}], "notes": []}, {"name": "Spark \u2014 Middleware", "entries": [{"name": "Livy", "url": "https://github.com/apache/incubator-livy", "description": "", "stars": "883"}, {"name": "spark-jobserver", "url": "https://github.com/spark-jobserver/spark-jobserver", "description": "", "stars": "2.8k"}, {"name": "Apache Toree", "url": "https://github.com/apache/incubator-toree", "description": "", "stars": "739"}, {"name": "Apache Kyuubi", "url": "https://github.com/apache/kyuubi", "description": "", "stars": "2.1k"}], "notes": []}, {"name": "Spark \u2014 Monitoring", "entries": [{"name": "Data Mechanics Delight", "url": "https://github.com/datamechanics/delight", "description": "", "stars": "342"}], "notes": []}, {"name": "Spark \u2014 Utilities", "entries": [{"name": "sparkly", "url": "https://github.com/Tubular/sparkly", "description": "", "stars": "60"}, {"name": "Flintrock", "url": "https://github.com/nchammas/flintrock", "description": "", "stars": "638"}, {"name": "Optimus", "url": "https://github.com/ironmussa/Optimus/", "description": "", "stars": "1.5k"}], "notes": []}, {"name": "Spark \u2014 Natural Language Processing", "entries": [{"name": "spark-nlp", "url": "https://github.com/JohnSnowLabs/spark-nlp", "description": "", "stars": "3.9k"}], "notes": []}, {"name": "Spark \u2014 Streaming", "entries": [{"name": "Apache Bahir", "url": "https://bahir.apache.org/", "description": ""}], "notes": []}, {"name": "Spark \u2014 Interfaces", "entries": [{"name": "Apache Beam", "url": "https://beam.apache.org/", "description": ""}, {"name": "Koalas", "url": "https://github.com/databricks/koalas", "description": "", "stars": "3.3k"}], "notes": []}, {"name": "Spark \u2014 Data quality", "entries": [{"name": "deequ", "url": "https://github.com/awslabs/deequ", "description": "", "stars": "3.3k"}, {"name": "python-deequ", "url": "https://github.com/awslabs/python-deequ", "description": "", "stars": "717"}], "notes": []}, {"name": "Spark \u2014 Testing", "entries": [{"name": "spark-testing-base", "url": "https://github.com/holdenk/spark-testing-base", "description": "", "stars": "1.5k"}, {"name": "spark-fast-tests", "url": "https://github.com/mrpowers-io/spark-fast-tests", "description": "", "stars": "432"}, {"name": "chispa", "url": "https://github.com/MrPowers/chispa", "description": "", "stars": "606"}], "notes": []}, {"name": "Spark \u2014 Web Archives", "entries": [{"name": "Archives Unleashed Toolkit", "url": "https://github.com/archivesunleashed/aut", "description": "", "stars": "137"}], "notes": []}, {"name": "Spark \u2014 Workflow Management", "entries": [{"name": "Cromwell", "url": "https://github.com/broadinstitute/cromwell#spark-backend", "description": "", "stars": "993"}], "notes": []}, {"name": "Spark \u2014 Books", "entries": [{"name": "Learning Spark, 2nd Edition", "url": "https://www.oreilly.com/library/view/learning-spark-2nd/9781492050032/", "description": "Introduction to Spark API with Spark 3.0 covered. Good source of knowledge about basic concepts."}, {"name": "Advanced Analytics with Spark", "url": "http://shop.oreilly.com/product/0636920035091.do", "description": "Useful collection of Spark processing patterns. Accompanying GitHub repository: [sryza/aas (\u2b501.5k)](https://github.com/sryza/aas)."}, {"name": "Mastering Apache Spark", "url": "https://jaceklaskowski.gitbooks.io/mastering-apache-spark/", "description": "Interesting compilation of notes by [Jacek Laskowski](https://github.com/jaceklaskowski). Focused on different aspects of Spark internals."}, {"name": "Spark in Action", "url": "https://www.manning.com/books/spark-in-action", "description": "New book in the Manning's \"in action\" family with +400 pages. Starts gently, step-by-step and covers large number of topics. Free excerpt on how to [setup Eclipse for Spark application development](http://freecontent.manning.com/how-to-start-developing-spark-applications-in-eclipse/) and how to bootstrap a new application using the provided Maven Archetype. You can find the accompanying GitHub repo [here (\u2b50273)](https://github.com/spark-in-action/first-edition)."}], "notes": []}, {"name": "Spark \u2014 Papers", "entries": [{"name": "Large-Scale Intelligent Microservices", "url": "https://arxiv.org/pdf/2009.08044.pdf", "description": "Microsoft paper that presents an Apache Spark-based micro-service orchestration framework that extends database operations to include web service primitives."}, {"name": "Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing", "url": "https://people.csail.mit.edu/matei/papers/2012/nsdi_spark.pdf", "description": "Paper introducing a core distributed memory abstraction."}, {"name": "Spark SQL: Relational Data Processing in Spark", "url": "https://amplab.cs.berkeley.edu/wp-content/uploads/2015/03/SparkSQLSigmod2015.pdf", "description": "Paper introducing relational underpinnings, code generation and Catalyst optimizer."}, {"name": "Structured Streaming: A Declarative API for Real-Time Applications in Apache Spark", "url": "https://cs.stanford.edu/~matei/papers/2018/sigmod_structured_streaming.pdf", "description": "Structured Streaming is a new high-level streaming API, it is a declarative API based on automatically incrementalizing a static relational query."}], "notes": []}, {"name": "Spark \u2014 MOOCS", "entries": [{"name": "Data Science and Engineering with Apache Spark (edX XSeries)", "url": "https://www.edx.org/xseries/data-science-engineering-apache-spark", "description": "Series of five courses ([Introduction to Apache Spark](https://www.edx.org/course/introduction-apache-spark-uc-berkeleyx-cs105x), [Distributed Machine Learning with Apache Spark](https://www.edx.org/course/distributed-machine-learning-apache-uc-berkeleyx-cs120x), [Big Data Analysis with Apache Spark](https://www.edx.org/course/big-data-analysis-apache-spark-uc-berkeleyx-cs110x), [Advanced Apache Spark for Data Science and Data Engineering](https://www.edx.org/course/advanced-apache-spark-data-science-data-uc-berkeleyx-cs115x), [Advanced Distributed Machine Learning with Apache Spark](https://www.edx.org/course/advanced-distributed-machine-learning-uc-berkeleyx-cs125x)) covering different aspects of software engineering and data science. Python oriented."}, {"name": "Big Data Analysis with Scala and Spark (Coursera)", "url": "https://www.coursera.org/learn/big-data-analysys", "description": "Scala oriented introductory course. Part of [Functional Programming in Scala Specialization](https://www.coursera.org/specializations/scala)."}], "notes": []}, {"name": "Spark \u2014 Workshops", "entries": [{"name": "AMP Camp", "url": "http://ampcamp.berkeley.edu", "description": "Periodical training event organized by the [UC Berkeley AMPLab](https://amplab.cs.berkeley.edu/). A source of useful exercise and recorded workshops covering different tools from the [Berkeley Data Analytics Stack](https://amplab.cs.berkeley.edu/software/)."}], "notes": []}, {"name": "Spark \u2014 Projects Using Spark", "entries": [{"name": "Oryx 2", "url": "https://github.com/OryxProject/oryx", "description": "[Lambda architecture](http://lambda-architecture.net/) platform built on Apache Spark and [Apache Kafka](http://kafka.apache.org/) with specialization for real-time large scale machine learning.", "stars": "1.8k"}, {"name": "Photon ML", "url": "https://github.com/linkedin/photon-ml", "description": "A machine learning library supporting classical Generalized Mixed Model and Generalized Additive Mixed Effect Model.", "stars": "793"}, {"name": "PredictionIO", "url": "https://prediction.io/", "description": "Machine Learning server for developers and data scientists to build and deploy predictive applications in a fraction of the time."}, {"name": "Crossdata", "url": "https://github.com/Stratio/Crossdata", "description": "Data integration platform with extended DataSource API and multi-user environment.", "stars": "169"}], "notes": []}, {"name": "Spark \u2014 Docker Images", "entries": [{"name": "apache/spark", "url": "https://hub.docker.com/r/apache/spark", "description": "Apache Spark Official Docker images."}, {"name": "jupyter/docker-stacks/pyspark-notebook", "url": "https://github.com/jupyter/docker-stacks/tree/master/pyspark-notebook", "description": "PySpark with Jupyter Notebook and Mesos client.", "stars": "8k"}, {"name": "sequenceiq/docker-spark", "url": "https://github.com/sequenceiq/docker-spark", "description": "Yarn images from [SequenceIQ](http://www.sequenceiq.com/).", "stars": "765"}, {"name": "datamechanics/spark", "url": "https://hub.docker.com/r/datamechanics/spark", "description": "An easy to setup Docker image for Apache Spark from [Data Mechanics](https://www.datamechanics.co/)."}], "notes": []}, {"name": "Spark \u2014 Miscellaneous", "entries": [{"name": "Spark with Scala Gitter channel", "url": "https://gitter.im/spark-scala/Lobby", "description": "\"*A place to discuss and ask questions about using Scala for Spark programming*\" started by [@deanwampler](https://github.com/deanwampler)."}, {"name": "Apache Spark User List", "url": "http://apache-spark-user-list.1001560.n3.nabble.com/", "description": ""}], "notes": []}], "total_entries": 78}, {"name": "Splunk", "subcategories": [{"name": "Splunk", "entries": [{"name": "Basics", "url": "#basics", "description": ""}, {"name": "Apps", "url": "#apps", "description": ""}, {"name": "Visualisations", "url": "#visualisations", "description": ""}, {"name": "Conferences, Meet-Ups and Socialising", "url": "#conferences-meet-ups-and-socialising", "description": ""}, {"name": "Unofficial Resources", "url": "#unofficial-resources", "description": ""}, {"name": "Splunk Website", "url": "https://splunk.com", "description": "Splunk's Homepage."}, {"name": "Splunk Answers", "url": "https://answers.splunk.com", "description": "Splunk's Community Questions and Answers."}, {"name": "SplunkBase", "url": "https://splunkbase.splunk.com", "description": "Splunk and Community built apps and add-ons."}, {"name": "Splunk Blogs", "url": "https://blogs.splunk.com/", "description": "Blog posts on various topics."}, {"name": "Splunk Dev", "url": "https://dev.splunk.com", "description": "Develop on Splunk."}, {"name": "Splunk Docs", "url": "https://docs.splunk.com/", "description": "Documentation."}, {"name": "Splunk App for Infrastructure", "url": "https://www.splunk.com/en_us/software/splunk-enterprise/server-and-infrastructure-monitoring-and-troubleshooting.html", "description": "Correlate logs and metrics for infrastructure monitoring."}, {"name": "Miscellaneous Scripts for fixing issues with the Universal Forwarder", "url": "https://github.com/jimmyatSplunk/SplunkForwarderRepairKit", "description": "This kit was compiled based on common issues with Splunk deployments and managing idiosyncrasies that tend to naturally occur.", "stars": "23"}], "notes": []}, {"name": "Splunk \u2014 Premium Apps", "entries": [{"name": "ES Home Page", "url": "https://www.splunk.com/en_us/software/enterprise-security.html", "description": "Splunk's Home Page for Enterprise Security."}, {"name": "ES Splunkbase Entry", "url": "https://splunkbase.splunk.com/app/263/", "description": "Download page (if licensed)."}, {"name": "ES Documentation", "url": "https://docs.splunk.com/Documentation/ES/latest", "description": "Splunk documentation for Enterprise Security."}, {"name": "Awesome-ES", "url": "https://github.com/sduff/awesome-es/", "description": "An Awesome list for all things Enterprise Security.", "stars": "12"}, {"name": "ITSI Home Page", "url": "https://www.splunk.com/en_us/software/it-service-intelligence.html", "description": "Splunk's Home Page for IT Service Intelligence."}, {"name": "ITSI Splunkbase Entry", "url": "https://splunkbase.splunk.com/app/1841/", "description": "Download page (if licensed)."}, {"name": "ITSI Documentation", "url": "https://docs.splunk.com/Documentation/ITSI/latest", "description": "ITSI Documentation."}, {"name": "Awesome-ITSI", "url": "https://github.com/sduff/awesome-itsi/", "description": "An Awesome list for all things IT Service Intelligence.", "stars": "13"}, {"name": "Event Timeline Viz", "url": "https://splunkbase.splunk.com/app/4370/", "description": "Interactive timeline with call-outs for events."}, {"name": "Timeline", "url": "https://splunkbase.splunk.com/app/3120/", "description": "Interactive timeline."}, {"name": "Halo", "url": "https://splunkbase.splunk.com/app/3514/", "description": "Hierarchical, relational pie charts."}, {"name": "Heat Map", "url": "https://splunkbase.splunk.com/app/4460/", "description": "A grid of related measurements, colour intensity derived from the value."}, {"name": "Calendar Heat Map", "url": "https://splunkbase.splunk.com/app/3162/", "description": "Heatmap broken down by days."}, {"name": "Punchcard", "url": "https://splunkbase.splunk.com/app/3129/", "description": "Punchcard Visualisation."}, {"name": "Horizon Chart", "url": "https://splunkbase.splunk.com/app/3117/", "description": "Horizon Chart Visualisation."}, {"name": "Sankey Diagram", "url": "https://splunkbase.splunk.com/app/3112/", "description": "Sankey Diagram Visualisation."}, {"name": "WebGL Globe", "url": "https://splunkbase.splunk.com/app/3674/", "description": "Spinning globe with events correlated to locations (flashy C-level eye-candy)."}, {"name": "Splunkbase Custom Visualizations", "url": "https://splunkbase.splunk.com/apps/#/app_content/visualizations", "description": "Download other custom visualizations from Splunkbase."}, {"name": "UserGroups", "url": "https://usergroups.splunk.com/", "description": "Find a nearby usergroup."}, {"name": ".Conf", "url": "https://conf.splunk.com", "description": "Splunk's annual conference website."}, {"name": "Splunk UserGroups Slack", "url": "http://splk.it/slack", "description": "Splunk's publicly accessible Slack."}, {"name": "/r/Splunk", "url": "https://reddit.com/r/splunk", "description": "Unofficial Sub-Reddit."}, {"name": "IRC", "url": "https://wiki.splunk.com/Community:IRC", "description": "Instructions for connecting to `#splunk` of Efnet."}, {"name": "Splunk Store", "url": "https://www.mylogocloud.com/splunk", "description": "Order some Splunk Schwag you missed from a meetup or .conf."}, {"name": "Splunk Trust", "url": "https://www.splunk.com/en_us/community/splunk-trust.html", "description": "The Splunk Trust is an invite only group of Splunk Ninjas."}, {"name": "Simon Duff", "url": "https://simonduff.net/splunk", "description": "Miscellaneous scripts and visualisations."}, {"name": "Ryan Faircloth", "url": "https://www.rfaircloth.com/", "description": "Security and Syslog related materials."}, {"name": "George Starcher", "url": "http://www.georgestarcher.com/", "description": "Many Splunk related items, including details on Splunk ES's Extreme Search."}, {"name": "Anthony Tellez", "url": "https://anthonygtellez.github.io/", "description": "Security and Machine Learning items."}, {"name": "Duane Waddle", "url": "https://www.duanewaddle.com/", "description": "Miscellaneous Splunk items."}, {"name": "Vladimir's GitHub", "url": "https://github.com/hire-vladimir/", "description": "Code for a number of Splunk resources, including [CIM Validation (\u2b5066)](https://github.com/hire-vladimir/SA-cim_vladiator)."}, {"name": "Nico's GitHub", "url": "https://github.com/nicovdw/", "description": "Repository of searches and dashboards to assist with optimising concurrency settings."}, {"name": "David Veuve", "url": "https://www.davidveuve.com/tech/", "description": "Some early resources on Splunk basics and optimisations (infrequently updated)."}, {"name": "GoSplunk", "url": "https://gosplunk.com/", "description": "Search Engine for Splunk Queries split by sourcetype and use-case."}], "notes": []}], "total_entries": 47}, {"name": "Qlik", "subcategories": [{"name": "Qlik", "entries": [{"name": "Official Resources and Documentation", "url": "#official-resources-and-documentation", "description": ""}, {"name": "Community Resources and Blogs", "url": "#community-resources-and-blogs", "description": ""}, {"name": "Books", "url": "#books", "description": ""}, {"name": "Extensions", "url": "#extensions", "description": ""}, {"name": "Qlik Blog", "url": "https://blog.qlik.com", "description": ""}, {"name": "Qlik Help", "url": "https://help.qlik.com", "description": ""}, {"name": "Living QlikView", "url": "http://livingqlikview.com/", "description": "Aaron Couron's blog with in-depth articles, technical tips and tricks, and reviews of third-party applications and resources that help to progress with QlikView and Qlik Sense."}, {"name": "Qlik Community", "url": "https://community.qlik.com", "description": "Qlik Community: product forums, resources, services, events, groups, and blogs on the Qlik platform."}, {"name": "QlikCentral", "url": "https://qlikcentral.com/", "description": "A blog where Richard Pearce shares insight and technical examples he develops in QlikView and Qlik Sense."}, {"name": "QlikFix", "url": "http://www.qlikfix.com", "description": "A blog dedicated to QlikView and Qlik Sense tips, tricks and tutorials from Barry Harmsen."}, {"name": "QlikView Cookbook", "url": "https://qlikviewcookbook.com/", "description": "A technical 'how-to' blog for QlikView developers, by Rob Wunderlich."}, {"name": "Qlik Tips", "url": "https://www.qliktips.com", "description": "An all-things-Qlik blog by Stephen Redmond."}, {"name": "Quick Intelligence", "url": "https://www.quickintelligence.co.uk/blog/", "description": "A blog where a skilled team of Qlik developers lead by Steve Dark, providing tips on working in Qlik Sense and QlikView."}], "notes": []}, {"name": "Qlik \u2014 About Qlik Sense", "entries": [{"name": "Learning Qlik Sense: The Official Guide", "url": "https://books.google.com/books/about/Learning_Qlik_Sense_The_Official_Guide.html?id=4zvlCwAAQBAJ\\&redir_esc=y", "description": "By Christopher Ilacqua, Henric Cronstr\u00f6m et al."}, {"name": "Qlik Sense Cookbook", "url": "https://books.google.com/books/about/Qlik_Sense_Cookbook.html?id=07xouwEACAAJ\\&redir_esc=y", "description": "By Pablo Labbe, Philip Hand et al."}, {"name": "Qlik Sense for Beginners", "url": "https://books.google.com/books/about/Qlik_Sense_for_Beginners.html?id=Dy3nBAAAQBAJ\\&redir_esc=y", "description": "By Mark O'Donovan."}, {"name": "Qlik Sense: Advanced Data Visualization for Your Organization", "url": "https://books.google.ru/books?id=kPNFDwAAQBAJ\\&redir_esc=y", "description": "By Dr. Christopher Ilacqua et al."}, {"name": "Predictive Analytics Using Rattle and Qlik Sense", "url": "https://www.amazon.com/Predictive-Analytics-using-Rattle-Sense/dp/1784395803", "description": "By Ferran Garcia Pagans."}, {"name": "Mastering Qlik Sense", "url": "https://www.amazon.com/Mastering-Qlik-Sense-self-service-Intelligence/dp/1783554029", "description": "By Martin Mahler and Juan Ignacio Vitantonio."}, {"name": "Implementing Qlik Sense", "url": "https://books.google.com/books/about/Implementing_Qlik_Sense.html?id=6nZaswEACAAJ\\&redir_esc=y", "description": "By Ganapati Hegde and Kaushik Solanki."}], "notes": []}, {"name": "Qlik \u2014 About QlikView", "entries": [{"name": "QlikView Essentials", "url": "https://books.google.com/books/about/QlikView_Essentials.html?id=5wMcDAAAQBAJ\\&redir_esc=y", "description": "By Chandraish Sinha."}, {"name": "Creating Stunning Dashboards with QlikView", "url": "https://www.amazon.com/Creating-Stunning-Dashboards-QlikView-Villafuerte/dp/1782175733", "description": "By Juli\u00e1n Villafuerte."}, {"name": "QlikView for Finance", "url": "https://books.google.com/books/about/QlikView_for_Finance.html?id=pNZOCwAAQBAJ\\&redir_esc=y", "description": "By B. Diane Blackwood."}, {"name": "QlikView Unlocked", "url": "https://books.google.com/books/about/QlikView_Unlocked.html?id=vvaoCwAAQBAJ\\&redir_esc=y", "description": "By Roger Stone and Andrew Dove."}, {"name": "Practical QlikView", "url": "https://www.amazon.com/Practical-QlikView-Mark-ODonovan-ebook/dp/B007QMMDL4", "description": "By Mark O'Donovan."}, {"name": "QlikView 11 for Developers", "url": "https://www.amazon.com/QlikView-Developers-Effective-techniques-Intelligence/dp/1849686068", "description": "By Miguel Garc\u00eda and Barry Harmsen."}, {"name": "QlikView Scripting", "url": "https://books.google.com/books/about/QlikView_Scripting.html?id=wng3ngEACAAJ\\&redir_esc=y", "description": "By Matt Floyd."}, {"name": "QlikView Server and Publisher", "url": "https://books.google.com/books/about/QlikView_Server_and_Publisher.html?id=gDOhAgAAQBAJ\\&redir_esc=y", "description": "By Stephen Redmond."}, {"name": "AnyChart", "url": "https://qlik.anychart.com", "description": ""}, {"name": "trueChart", "url": "https://www.truechart.com", "description": ""}, {"name": "VizLib", "url": "https://www.vizlib.com", "description": ""}], "notes": []}], "total_entries": 31}], "list_count": 7, "total_entries": 1216, "subcategory_count": 7}