| field | value | date |
|---|---|---|
| author | Jingjing Ren <renjj@ccs.neu.edu> | 2016-12-09 21:43:18 -0500 |
| committer | Jingjing Ren <renjj@ccs.neu.edu> | 2016-12-09 21:43:18 -0500 |
| commit | 4c2ff735326ce7686844c5738bc130bf78f5b9a8 (patch) | |
| tree | cba18617dfcd15229f8f6831f8182befc0892f40 | |
| parent | c510053c1e610a855c5990f3fd18571745d198a2 (diff) | |
add bib
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | _bibliography/big-data.bib | 53 |
| -rw-r--r-- | chapter/8/big-data.md | 28 |
2 files changed, 66 insertions, 15 deletions
diff --git a/_bibliography/big-data.bib b/_bibliography/big-data.bib
index f32cfbc..599b3c9 100644
--- a/_bibliography/big-data.bib
+++ b/_bibliography/big-data.bib
@@ -1,3 +1,11 @@
+@inproceedings{armbrust2015spark,
+ title={Spark sql: Relational data processing in spark},
+ author={Armbrust, Michael and Xin, Reynold S and Lian, Cheng and Huai, Yin and Liu, Davies and Bradley, Joseph K and Meng, Xiangrui and Kaftan, Tomer and Franklin, Michael J and Ghodsi, Ali and others},
+ booktitle={Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data},
+ pages={1383--1394},
+ year={2015},
+ organization={ACM}
+}
 @article{bu2010haloop,
 title={HaLoop: efficient iterative data processing on large clusters},
@@ -54,6 +62,26 @@
 organization={ACM}
 }
+@inproceedings{ghemawat2003google,
+ title={The Google file system},
+ author={Ghemawat, Sanjay and Gobioff, Howard and Leung, Shun-Tak},
+ booktitle={ACM SIGOPS operating systems review},
+ volume={37},
+ number={5},
+ pages={29--43},
+ year={2003},
+ organization={ACM}
+}
+
+@inproceedings{hindman2011mesos,
+ title={Mesos: A Platform for Fine-Grained Resource Sharing in the Data Center.},
+ author={Hindman, Benjamin and Konwinski, Andy and Zaharia, Matei and Ghodsi, Ali and Joseph, Anthony D and Katz, Randy H and Shenker, Scott and Stoica, Ion},
+ booktitle={NSDI},
+ volume={11},
+ pages={22--22},
+ year={2011}
+}
+
 @inproceedings{isard2007dryad,
 title={Dryad: distributed data-parallel programs from sequential building blocks},
 author={Isard, Michael and Budiu, Mihai and Yu, Yuan and Birrell, Andrew and Fetterly, Dennis},
@@ -96,7 +124,14 @@
 publisher={Hindawi Publishing Corporation}
 }
-
+@inproceedings{shvachko2010hadoop,
+ title={The hadoop distributed file system},
+ author={Shvachko, Konstantin and Kuang, Hairong and Radia, Sanjay and Chansler, Robert},
+ booktitle={2010 IEEE 26th symposium on mass storage systems and technologies (MSST)},
+ pages={1--10},
+ year={2010},
+ organization={IEEE}
+}
 @online{WinNT,
 author = {Tarau, Paul},
@@ -125,6 +160,22 @@
 year={2010},
 organization={IEEE}
 }
+@inproceedings{vavilapalli2013apache,
+ title={Apache hadoop yarn: Yet another resource negotiator},
+ author={Vavilapalli, Vinod Kumar and Murthy, Arun C and Douglas, Chris and Agarwal, Sharad and Konar, Mahadev and Evans, Robert and Graves, Thomas and Lowe, Jason and Shah, Hitesh and Seth, Siddharth and others},
+ booktitle={Proceedings of the 4th annual Symposium on Cloud Computing},
+ pages={5},
+ year={2013},
+ organization={ACM}
+}
+@inproceedings{xin2013graphx,
+ title={Graphx: A resilient distributed graph system on spark},
+ author={Xin, Reynold S and Gonzalez, Joseph E and Franklin, Michael J and Stoica, Ion},
+ booktitle={First International Workshop on Graph Data Management Experiences and Systems},
+ pages={2},
+ year={2013},
+ organization={ACM}
+}
 @inproceedings{yu2008dryadlinq,
 title={DryadLINQ: A System for General-Purpose Distributed Data-Parallel Computing Using a High-Level Language.},

diff --git a/chapter/8/big-data.md b/chapter/8/big-data.md
index 600ca24..209a3ad 100644
--- a/chapter/8/big-data.md
+++ b/chapter/8/big-data.md
@@ -71,7 +71,7 @@ MapReduce runs on hundreds or thousands of unreliable commodity machines, so the

Many analytics workloads such as K-means and logistic regression, and graph processing applications such as PageRank and shortest path using parallel breadth-first search, require multiple stages of MapReduce jobs.

In a regular MapReduce framework like Hadoop, this requires the developer to manually handle the iterations in the driver code. At every iteration, the result of stage T is written to HDFS and loaded back again at stage T+1, causing a performance bottleneck: wasted network bandwidth, wasted CPU resources, and above all inherently slow disk I/O. To address these challenges for iterative workloads on MapReduce, frameworks like HaLoop {% cite bu2010haloop --file big-data %}, Twister {% cite ekanayake2010twister --file big-data %} and iMapReduce {% cite zhang2012imapreduce --file big-data %} adopt special techniques such as caching data between iterations and keeping the mappers and reducers alive across iterations.
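To make the per-iteration HDFS round trip concrete, here is a minimal driver sketch in Scala against Hadoop's `Job` API. It is only an illustration: the paths, the iteration count, and the (omitted) mapper/reducer classes are hypothetical, not taken from the chapter.

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

object IterativeDriver {
  def main(args: Array[String]): Unit = {
    val conf  = new Configuration()
    var input = new Path("/pagerank/iter0")            // hypothetical initial data on HDFS
    for (i <- 1 to 10) {                               // the driver hand-rolls the iteration loop
      val output = new Path(s"/pagerank/iter$i")
      val job = Job.getInstance(conf, s"pagerank-iteration-$i")
      // job.setMapperClass(...) / job.setReducerClass(...) would carry the per-stage logic
      FileInputFormat.addInputPath(job, input)         // re-read the previous result from HDFS
      FileOutputFormat.setOutputPath(job, output)      // write this iteration's result back to HDFS
      if (!job.waitForCompletion(true)) sys.exit(1)
      input = output                                   // the disk round trip repeats every iteration
    }
  }
}
```

Every iteration pays the full job-launch and HDFS read/write cost, which is exactly what HaLoop, Twister and iMapReduce try to avoid by caching loop-invariant data and keeping tasks alive.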
### 1.1.2 FlumeJava
-FlumeJava was introduced to make it easy to develop, test, and run efficient data-parallel pipelines. FlumeJava represents each dataset as an object, and transformations are invoked by applying methods to these objects. It constructs an efficient internal execution plan from a pipeline of MapReduce jobs, uses deferred evaluation, and optimizes based on the plan structure. The debugging ability allows programmers to run on the local machine first and then deploy to large clusters.
+FlumeJava {% cite chambers2010flumejava --file big-data %} was introduced to make it easy to develop, test, and run efficient data-parallel pipelines. FlumeJava represents each dataset as an object, and transformations are invoked by applying methods to these objects. It constructs an efficient internal execution plan from a pipeline of MapReduce jobs, uses deferred evaluation, and optimizes based on the plan structure. The debugging ability allows programmers to run on the local machine first and then deploy to large clusters.

*Core Abstraction*

- `PCollection<T>`, an immutable bag of elements of type `T`

@@ -86,11 +86,11 @@ The state of each `PCollection` object is either *deferred* (not yet computed) a
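To illustrate the deferred-evaluation idea behind `PCollection`, here is a toy Scala sketch; the names and types are invented for this illustration and are not FlumeJava's actual Java API. Transformations only record a plan node, and nothing is computed until the pipeline is explicitly run.

```scala
// Toy model of a deferred data-parallel pipeline: transformations build a plan,
// and only run() walks the plan and materializes a result.
sealed trait Plan[A] { def eval(): Seq[A] }
case class Source[A](data: Seq[A]) extends Plan[A] {
  def eval(): Seq[A] = data
}
case class MapNode[A, B](in: Plan[A], f: A => B) extends Plan[B] {
  def eval(): Seq[B] = in.eval().map(f)
}

// A stand-in for a PCollection: each operation returns a new deferred collection.
class DeferredCollection[A](plan: Plan[A]) {
  def parallelDo[B](f: A => B): DeferredCollection[B] = new DeferredCollection(MapNode(plan, f))
  def run(): Seq[A] = plan.eval()             // the only point where computation happens
}

object PipelineDemo extends App {
  val words = new DeferredCollection(Source(Seq("flume", "java", "pipeline")))
  val upper = words.parallelDo(_.toUpperCase) // still deferred: only the plan grows
  println(upper.run())                        // List(FLUME, JAVA, PIPELINE)
}
```

A real system inspects the whole plan before executing it, which is what lets FlumeJava fuse steps into a small number of MapReduce jobs.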
### 1.1.3 Dryad
-Dryad is a more general and flexible execution engine that executes subroutines at specified graph vertices. Developers specify an arbitrary directed acyclic graph that combines computational "vertices" with communication channels (file, TCP pipe, shared-memory FIFO) to build a dataflow graph. Compared with MapReduce, Dryad can specify an arbitrary DAG that has multiple inputs/outputs and supports multiple stages. It also offers more channel types and boosts performance when using TCP pipes and shared memory. But, like writing a pipeline of MapReduce jobs, Dryad is a low-level programming model that is hard for users to program, so a more declarative model, DryadLINQ, was created to fill the gap. It exploits LINQ, a query language in .NET, automatically translates the data-parallel parts into an execution plan, and passes it to the Dryad execution engine. Like MapReduce, writing raw Dryad is hard; programmers need to understand system resources and other lower-level details. This motivates a more declarative programming model: DryadLINQ, a querying language.
+Dryad is a more general and flexible execution engine that executes subroutines at specified graph vertices. Developers specify an arbitrary directed acyclic graph that combines computational "vertices" with communication channels (file, TCP pipe, shared-memory FIFO) to build a dataflow graph. Compared with MapReduce, Dryad can specify an arbitrary DAG that has multiple inputs/outputs and supports multiple stages. It also offers more channel types and boosts performance when using TCP pipes and shared memory. But, like writing a pipeline of MapReduce jobs, Dryad is a low-level programming model that is hard for users to program, so a more declarative model, DryadLINQ {% cite yu2008dryadlinq --file big-data %}, was created to fill the gap. It exploits LINQ, a query language in .NET, automatically translates the data-parallel parts into an execution plan, and passes it to the Dryad execution engine. Like MapReduce, writing raw Dryad is hard; programmers need to understand system resources and other lower-level details. This motivates a more declarative programming model: DryadLINQ, a querying language.

### 1.1.4 Spark
-Spark is a fast, in-memory data processing engine with an elegant and expressive development interface that enables developers to efficiently execute machine learning, SQL, or streaming workloads that require fast iterative access to datasets. It is a functional-style programming model (similar to DryadLINQ) in which a developer creates acyclic dataflow graphs and transforms a set of input data through map/reduce-like operators. Spark provides two main abstractions - distributed in-memory storage (RDDs) and parallel operations on datasets (based on Scala's collection API) - for high-performance processing, scalability and fault tolerance.
+Spark {% cite zaharia2010spark --file big-data %} is a fast, in-memory data processing engine with an elegant and expressive development interface that enables developers to efficiently execute machine learning, SQL, or streaming workloads that require fast iterative access to datasets. It is a functional-style programming model (similar to DryadLINQ) in which a developer creates acyclic dataflow graphs and transforms a set of input data through map/reduce-like operators. Spark provides two main abstractions - distributed in-memory storage (RDDs) and parallel operations on datasets (based on Scala's collection API) - for high-performance processing, scalability and fault tolerance.
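As a concrete sketch of these two abstractions in Spark's Scala API (the input path and the threshold below are made up for illustration): transformations build an acyclic dataflow over an RDD, and `cache()` keeps the intermediate result in memory so that repeated or iterative use avoids the per-stage HDFS round trips described earlier.

```scala
import org.apache.spark.{SparkConf, SparkContext}

object RddSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("rdd-sketch").setMaster("local[*]"))

    // An RDD is transformed through map/reduce-like operators, much like a Scala collection.
    val counts = sc.textFile("hdfs:///data/pages.txt")   // hypothetical input
      .flatMap(_.split("\\s+"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .cache()                                            // keep the result in memory

    // Both actions below reuse the cached RDD instead of re-reading the input from HDFS.
    println(counts.count())
    counts.filter { case (_, n) => n > 100 }.take(10).foreach(println)

    sc.stop()
  }
}
```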
*Distributed in-memory storage - Resilient Distributed Data sets :*

@@ -146,18 +146,18 @@ Non-programmers like data scientists would highly prefer SQL like interface over

Sawzall {% cite pike2005interpreting --file big-data %} is a programming language built on top of MapReduce. It consists of a *filter* phase (map) and an *aggregation* phase (reduce). A user program specifies the filter function and emits the intermediate pairs to external pre-built aggregators.

-Apart from Sawzall, Pig and Hive are the other major components that sit on top of the Hadoop framework for processing large data sets without users having to write Java-based MapReduce code.
+Apart from Sawzall, Pig {% cite olston2008pig --file big-data %} and Hive {% cite thusoo2009hive --file big-data %} are the other major components that sit on top of the Hadoop framework for processing large data sets without users having to write Java-based MapReduce code.

-Hive {% cite thusoo2009hive --file big-data %} was built by Facebook to organize datasets in structured formats while still utilizing the benefits of the MapReduce framework. It has its own SQL-like language, HiveQL, which is easy for anyone who understands SQL. Hive reduces code complexity and eliminates a lot of the boilerplate that would otherwise be an overhead with the Java-based MapReduce approach. It has a component called the *metastore* that is created and reused each time a table is referenced by HiveQL, the way traditional warehousing solutions do. The drawback to using Hive is that programmers have to be familiar with basic techniques and best practices for running their Hive queries at maximum speed, since performance depends on the Hive optimizer and developers effectively have to steer it toward efficient plans.
+Hive was built by Facebook to organize datasets in structured formats while still utilizing the benefits of the MapReduce framework. It has its own SQL-like language, HiveQL {% cite thusoo2010hive --file big-data %}, which is easy for anyone who understands SQL. Hive reduces code complexity and eliminates a lot of the boilerplate that would otherwise be an overhead with the Java-based MapReduce approach. It has a component called the *metastore* that is created and reused each time a table is referenced by HiveQL, the way traditional warehousing solutions do. The drawback to using Hive is that programmers have to be familiar with basic techniques and best practices for running their Hive queries at maximum speed, since performance depends on the Hive optimizer and developers effectively have to steer it toward efficient plans.

A relational interface to big data is good; however, it doesn't cater to users who want to perform

- ETL to and from various semi-structured or unstructured data sources.
- advanced analytics like machine learning or graph processing.

-These user actions require the best of both worlds - relational queries and procedural algorithms. Pig Latin and Spark SQL bridge this gap by letting users seamlessly intermix relational and procedural APIs. Both frameworks free the programmer from worrying about the internal execution model by implicitly optimizing the user's input DAG of transformations.
+These user actions require the best of both worlds - relational queries and procedural algorithms. Pig Latin {% cite olston2008pig --file big-data %} and Spark SQL {% cite armbrust2015spark --file big-data %} bridge this gap by letting users seamlessly intermix relational and procedural APIs. Both frameworks free the programmer from worrying about the internal execution model by implicitly optimizing the user's input DAG of transformations.

-Pig Latin {% cite olston2008pig --file big-data %} aims at a sweet spot between declarative and procedural programming. For advanced programmers, SQL is an unnatural way to implement program logic, and Pig Latin instead decomposes the data transformation into a sequence of steps. This makes Pig more verbose than Hive.
+Pig Latin aims at a sweet spot between declarative and procedural programming. For advanced programmers, SQL is an unnatural way to implement program logic, and Pig Latin instead decomposes the data transformation into a sequence of steps. This makes Pig more verbose than Hive.

SparkSQL, though it has the same goals as Pig, is better positioned given the Spark execution engine, Spark's efficient fault tolerance mechanism, and its specialized data structure, the Dataset.

The following subsections discuss Hive, Pig Latin, and SparkSQL in detail.
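As one concrete illustration of this intermixing, consider a small sketch in Spark's Scala DataFrame API (the table mirrors the `urls(url, category, pagerank)` example used below; the helper function is our own invention): a procedural Scala function is registered as a UDF and composed freely with relational operators, and the combined DAG is optimized as a whole.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.udf

object MixedRelationalProcedural {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("mixed").master("local[*]").getOrCreate()
    import spark.implicits._

    val urls = Seq(
      ("a.com/x", "news",   0.91),
      ("b.org/y", "news",   0.35),
      ("c.net/z", "sports", 0.77)
    ).toDF("url", "category", "pagerank")

    // Procedural piece: arbitrary Scala code wrapped as a UDF.
    val domain = udf((url: String) => url.takeWhile(_ != '/'))

    // Relational piece: filter, projection and aggregation, planned together with the UDF call.
    urls.filter($"pagerank" > 0.5)
        .withColumn("domain", domain($"url"))
        .groupBy("category")
        .count()
        .show()

    spark.stop()
  }
}
```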
### 1.2.1 Hive/HiveQL
-Hive is a data-warehousing infrastructure built on top of the MapReduce framework, Hadoop. The primary responsibility of Hive is to provide data summarization, query and analysis. It supports analysis of large datasets stored in Hadoop's HDFS and provides SQL-like access to structured data, known as HiveQL (or HQL), as well as big data analysis with the help of MapReduce. These SQL queries are compiled into MapReduce jobs that can be executed on Hadoop, which drastically brings down the development time for writing and maintaining Hadoop jobs.
+Hive is a data-warehousing infrastructure built on top of the MapReduce framework, Hadoop. The primary responsibility of Hive is to provide data summarization, query and analysis. It supports analysis of large datasets stored in Hadoop's HDFS {% cite shvachko2010hadoop --file big-data %} and provides SQL-like access to structured data, known as HiveQL (or HQL), as well as big data analysis with the help of MapReduce. These SQL queries are compiled into MapReduce jobs that can be executed on Hadoop, which drastically brings down the development time for writing and maintaining Hadoop jobs.

Data in Hive is organized into three different formats:

@@ -193,7 +193,7 @@ INSERT INTO, UPDATE, and DELETE are not supported which makes it easier to handl

Hive implements the LazySerDe as the default SerDe. It deserializes rows into internal objects lazily, so that the cost of deserializing a column is incurred only when it is needed. Hive also provides a RegexSerDe, which allows the use of regular expressions to parse columns out of a row. Hive also supports various formats like TextInputFormat, SequenceFileInputFormat and RCFileInputFormat.

### 1.2.2 Pig Latin
-The goal of Pig Latin is to attract experienced programmers to perform ad-hoc analysis on big data. Parallel database products provide a simple SQL query interface, which is good for non-programmers and simple tasks, but not in a style experienced programmers are comfortable with. Such programmers instead prefer to specify individual steps and compose them as a sequence.
+The goal of Pig Latin {% cite olston2008pig --file big-data %} is to attract experienced programmers to perform ad-hoc analysis on big data. Parallel database products provide a simple SQL query interface, which is good for non-programmers and simple tasks, but not in a style experienced programmers are comfortable with. Such programmers instead prefer to specify individual steps and compose them as a sequence.

For example, suppose we have a table urls: `(url, category, pagerank)`. The following is a simple SQL query that finds, for each sufficiently large category, the average pagerank of high-pagerank urls in that category.

@@ -223,7 +223,7 @@ output = FOREACH big_groups GENERATE

### 1.2.3 SparkSQL
-The major contributions of Spark SQL are the DataFrame API and the Catalyst optimizer. Spark SQL intends to provide relational processing over native RDDs and several external data sources through a programmer-friendly API, high performance through DBMS techniques, support for semi-structured data and external databases, and support for advanced analytical processing such as machine learning algorithms and graph processing.
+The major contributions of Spark SQL {% cite armbrust2015spark --file big-data %} are the DataFrame API and the Catalyst optimizer. Spark SQL intends to provide relational processing over native RDDs and several external data sources through a programmer-friendly API, high performance through DBMS techniques, support for semi-structured data and external databases, and support for advanced analytical processing such as machine learning algorithms and graph processing.
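A minimal sketch of the DataFrame API from Scala may help here (the JSON file and its schema are assumed for illustration, and we use the newer `SparkSession` entry point rather than the original `SQLContext`): the same data can be queried through SQL or through DataFrame methods, and `explain(true)` prints the plans Catalyst derives, down to the optimized physical plan.

```scala
import org.apache.spark.sql.SparkSession

object SparkSqlSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("spark-sql-sketch").master("local[*]").getOrCreate()

    // DataFrames can be built from RDDs or from external, possibly semi-structured sources.
    val people = spark.read.json("people.json")          // hypothetical input file
    people.createOrReplaceTempView("people")

    // Relational query over the view; Catalyst analyzes and optimizes it before execution.
    val adults = spark.sql("SELECT name, age FROM people WHERE age >= 18")
    adults.explain(true)   // parsed, analyzed, optimized and physical plans
    adults.show()

    spark.stop()
  }
}
```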
***Programming API***

@@ -270,7 +270,7 @@ BSP model is a message passing synchronous model where -

A notable feature of the model is the complete control over data through communication between every processor at every superstep.

Though similar to the MapReduce model, BSP preserves data in memory across supersteps and makes it easier to reason about iterative graph algorithms. The graph-parallel abstractions allow users to succinctly describe graph algorithms and provide a runtime engine to execute these algorithms in a distributed manner. They simplify the design, implementation, and application of sophisticated graph algorithms to large-scale real-world problems. Each of these frameworks presents a different view of graph computation, tailored to an originating domain or family of graph algorithms. However, these frameworks fail to address the problems of data preprocessing and construction, favor snapshot recovery over fault tolerance, and lack support from distributed dataflow frameworks. The data-parallel systems are well suited to the task of graph construction and are highly scalable; however, they suffer from the very problems, mentioned before, for which the graph-parallel systems came into existence.

-GraphX is a new computation system which builds upon Spark's Resilient Distributed Dataset (RDD) to form a new abstraction, the Resilient Distributed Graph (RDG), which represents records and their relations as vertices and edges respectively. RDGs leverage the RDD's fault tolerance mechanism and expressivity.
+GraphX {% cite xin2013graphx --file big-data %} is a new computation system which builds upon Spark's Resilient Distributed Dataset (RDD) to form a new abstraction, the Resilient Distributed Graph (RDG), which represents records and their relations as vertices and edges respectively. RDGs leverage the RDD's fault tolerance mechanism and expressivity.

How does GraphX improve over the existing graph-parallel and dataflow models? The RDGs in GraphX provide a set of elegant and expressive computational primitives through which many graph-parallel systems like Pregel and PowerGraph can be expressed in minimal lines of code. GraphX simplifies the process of graph ETL and analysis through new operations like filter, view and graph transformations. It minimizes communication and storage overhead.
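A small sketch in GraphX's released Scala API (which exposes `Graph` rather than the RDG name used in the paper; the toy vertices and edges are invented): vertex and edge collections are plain RDDs, so graph construction is ordinary data-parallel ETL, while graph-parallel computations such as PageRank run over the same lineage-tracked representation.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}

object GraphxSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("graphx-sketch").setMaster("local[*]"))

    // Records and relations as vertex and edge RDDs.
    val vertices = sc.parallelize(Seq((1L, "alice"), (2L, "bob"), (3L, "carol")))
    val edges    = sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 1), Edge(3L, 1L, 1)))
    val graph    = Graph(vertices, edges)

    // A graph-parallel computation expressed in a few lines; recovery relies on RDD lineage.
    val ranks = graph.pageRank(tol = 0.001).vertices
    ranks.join(vertices).collect().foreach { case (_, (rank, name)) => println(s"$name: $rank") }

    sc.stop()
  }
}
```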
@@ -424,14 +424,14 @@ Apache Hadoop is an open-sourced framework that supports distributed processing

*Figure is from http://thebigdatablog.weebly.com/blog/the-hadoop-ecosystem-overview*

-HDFS forms the data management layer: a distributed file system designed to provide reliable, scalable storage across large clusters of unreliable commodity machines. The idea was inspired by the GFS paper. Unlike the closed GFS, HDFS is open-sourced and provides various libraries and interfaces to support different file systems, like S3, KFS etc.
+HDFS forms the data management layer: a distributed file system designed to provide reliable, scalable storage across large clusters of unreliable commodity machines. The idea was inspired by GFS {% cite ghemawat2003google --file big-data %}. Unlike the closed GFS, HDFS is open-sourced and provides various libraries and interfaces to support different file systems, like S3, KFS etc.

To satisfy different needs, big companies like Facebook and Yahoo developed additional tools. Facebook's Hive, as a warehouse system, provides a more declarative programming interface and translates queries into Hadoop jobs. Yahoo's Pig platform is an ad-hoc analysis tool that can structure HDFS objects and supports operations like grouping, joining and filtering.

***Spark Ecosystem***

-Apache Spark's rich ecosystem consists of third-party libraries like Mesos/Yarn and several major components that have already been discussed in this article, like Spark core, SparkSQL and GraphX.
+Apache Spark's rich ecosystem consists of third-party libraries like Mesos {% cite hindman2011mesos --file big-data %}/Yarn {% cite vavilapalli2013apache --file big-data %} and several major components that have already been discussed in this article, like Spark core, SparkSQL and GraphX.

In this section we will discuss the remaining yet very important components/libraries which help Spark deliver high performance.

<figure class="main-container">

@@ -445,7 +445,7 @@ Spark achieves fault tolerant, high throughput data streaming workloads in real-

*Apache Mesos*

-Apache Mesos is an open source cluster/resource manager developed at the University of California, Berkeley, and used by companies such as Twitter, Airbnb and Netflix for handling workloads in a distributed environment through dynamic resource sharing and isolation. It aids in the deployment and management of applications in large-scale clustered environments. Mesos abstracts node allocation by combining the existing resources of the machines/nodes in a cluster into a single pool, enabling fault-tolerant, elastic distributed systems. A variety of workloads can utilize nodes from this single pool, avoiding the need to allocate specific machines to different workloads. Mesos is highly scalable, achieves fault tolerance through Apache Zookeeper, and is an efficient CPU- and memory-aware resource scheduler.
+Apache Mesos {% cite hindman2011mesos --file big-data %} is an open source cluster/resource manager developed at the University of California, Berkeley, and used by companies such as Twitter, Airbnb and Netflix for handling workloads in a distributed environment through dynamic resource sharing and isolation. It aids in the deployment and management of applications in large-scale clustered environments. Mesos abstracts node allocation by combining the existing resources of the machines/nodes in a cluster into a single pool, enabling fault-tolerant, elastic distributed systems. A variety of workloads can utilize nodes from this single pool, avoiding the need to allocate specific machines to different workloads. Mesos is highly scalable, achieves fault tolerance through Apache Zookeeper, and is an efficient CPU- and memory-aware resource scheduler.

*Alluxio/Tachyon*
