import React, { Component } from "react";
import {
    Accordion,
    AccordionItem,
    AccordionItemHeading,
    AccordionItemButton,
    AccordionItemPanel,
} from 'react-accessible-accordion';

// Demo stylesheet shipped with react-accessible-accordion; see the 'Styles' section of that library's README for notes on use.
import 'react-accessible-accordion/dist/fancy-example.css';

class DataAnalytics extends Component {

    openUrl(value){
        window.open(value, false);
      }
      
      
  render() {
    return (
      <div className="pad-top-30 pad-left-page">
          <div class="row">
            <div class="col-sm-7 col-md-5">
                  <div class="">
                    <h2>Data Analytics</h2>                    
                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                    Kinesis
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                                
                                <br/>
                                <b>Kinesis Stream </b> <br/>- real time process or analyze streaming data for specialized needs. define shard. more scalable . <br/>
                                    - Kinesis PutRecord to insert data into Kinesis in real time since it sends its data synchronously and does not have the processing delay. <br/>
                                        To handle failure of PutRecord, AWS recommends using Error Retries and Exponential Backoff in AWS<br/>
                                    - Kinesis Data Streams Producer Library is not meant to be used for real-time processing of event data since it buffers the data and <br/>
                                        can incur an additional processing delay of up to RecordMaxBufferedTime within the library 	 <br/>
                                        KPL PutRecords automatically adds any failed records back into the KPL buffer so it can be retried <br/>
                                    - ability to replay the data as well access to the same data to multiple Kinesis client applications<br/>
                                    - Security data at rest: Enable server-side encryption<br/>
                                    - enhanced fan-out - enables consumers to receive records from a stream with throughput of up to 2 MB of data per second per shard. <br/>
                                        This dedicated throughput provides consumers don't have to contend with other consumers while receiving data from the stream. <br/>
                                        This is a push mechanism. Consumers don't need to poll for data  Kinesis Data Streams pushes data records from the stream to consumers that use enhanced fan-out. <br/>
                                    - KCL uses a unique Amazon DynamoDB table to keep track of the application's state. <br/>
                                        Because the KCL uses the name of the Amazon Kinesis Data Streams application to create the name of the table, each application name must be unique.<br/>
                                        If the KDS application receives provisioned-throughput exceptions, increase the provisioned throughput for the DynamoDB table. <br/>
                                        By default KCL creates the table with a provisioned throughput of 10 reads per second and 10 writes per second<br/>
                                    - Each Kinesis shard has 1 MiB of data per second or 1000 records in write capacity and 5 read transactions in read capacity. <br/>
                                        Attempting to exceed these limits results in throttling and your producers and consumers will not be able to carry out their operations<br/>
                                        Increase the number of shards in the stream using the UpdateShardCount API.<br/>
                                        Choose partition keys in a way that results in a uniform record distribution across shards.<br/>
                                    - NOTE: Kinesis Data Streams do not stream data directly to S3 or Redshift<br/>
                                    <br/>

                                    <img styleName="width:30%" src="https://d1.awsstatic.com/Products/product-name/diagrams/product-page-diagram_Amazon-Kinesis-Data-Streams.074de94302fd60948e1ad070e425eeda73d350e7.png"></img>
                                    

                                    <br/><br/>
                                    <b>Kinesis Firehose </b> <br/>- near real time analytics (s3/redshift/elastic/splunk)  <br/>
                                    - can capture, transform, and deliver streaming data to Amazon S3, Amazon Redshift, Amazon Elasticsearch Service, generic HTTP endpoints, and service providers like Datadog, New Relic, MongoDB, and Splunk.<br/>
                                    - It can also batch, compress, transform, and encrypt your data streams before loading, minimizing the amount of storage used and increasing security<br/>
                                    - ex: ad-hoc business analytics queries on well-structured data with high velocity (can use with redshift to ad-hoc query)<br/>
                                    - Amazon Kinesis Data Firehose can convert the format of your input data from JSON to Apache Parquet or Apache ORC before storing the data in Amazon S3. <br/>
                                    Parquet and ORC are columnar data formats that save space and enable faster queries compared to row-oriented formats like JSON. <br/>
                                    If you want to convert an input format other than JSON, such as comma-separated values (CSV) or structured text, <br/>
                                    you can use AWS Lambda to transform it to JSON first<br/>
                                    <br/><br/>
                                    <b>Kinesis analytics </b> <br/>- perform analysis on the streaming data<br/>
                                    - Continuous Query can provide the ability to monitor, query and generate alerts on a stream. <br/>
                                        Continuous Query is a query over a stream executes continuously over streaming data. <br/>
                                        This continuous execution enables scenarios, such as the ability for applications to continuously query a stream and generate alerts<br/>
                                    - Stagger windows query aggregates data using keyed time-based windows that open as data arrives. <br/>
                                        The keys allow for multiple overlapping windows. This is the recommended way to aggregate data using time-based windows 	<br/>
                                    - Tumbling Windows query, A query that aggregates data using distinct time-based windows that open and close at regular intervals.  suitable when a windowed query processes each window in a non-overlapping manner<br/>
                                    - Sliding windows query aggregates data continuously, using a fixed time or row count interval<br/>
                                    - By using the built-in RANDOM_CUT_FOREST function in Kinesis Data Analytics, you can detect anomalies in real time with the sensor data that is stored in Kinesis Data Streams. RANDOM_CUT_FOREST is also an appropriate algorithm for many other kinds of anomaly-detection use cases—for example, the media sentiment example mentioned earlier in this post.<br/>

                                    <br/><br/>
                                    <img src="https://docs.aws.amazon.com/kinesisanalytics/latest/dev/images/kinesis-app.png"></img>

                                    <br/><br/>
                                    <img src="https://d2908q01vomqb2.cloudfront.net/b6692ea5df920cad691c20319a6fffd7a4a766b8/2020/02/26/KinesisElasticsearch1.jpg"></img>


                                    <br/><br/>
                                    <b>Amazon Kinesis Video Streams </b> <br/>-  makes it easy to securely stream video from connected devices to AWS for analytics, <br/>
                                    machine learning (ML), and other processing. <br/>
                                    Kinesis Video Streams automatically provisions and elastically scales all the infrastructure needed to ingest streaming video data <br/>
                                    from millions of devices. <br/>
                                    It also durably stores, encrypts, and indexes video data in your streams, and allows you to access your data through easy-to-use APIs. <br/>
                                    Kinesis Video Streams enables you to quickly build computer vision and ML applications through integration with Amazon Rekognition Video and libraries for ML frameworks <br/>
                                    such as Apache MxNet, TensorFlow, and OpenCV.<br/>
                                    Kinesis Video Streams is ideal for building computer vision-enabled ML applications that are becoming prevalent in a wide range of <br/>
                                    use cases such <br/>
                                    Smart Home - With Kinesis Video Streams, you can easily stream video and audio from camera-equipped home devices such as baby monitors, <br/>
                                    webcams, and home surveillance systems to AWS. You can then use the streams to build a variety of smart home applications ranging from <br/>
                                    simple video playback to intelligent lighting, climate control systems, and security solutions. <br/>
                                    No need for transforming the data to HLS, if using Kinesis Video Streams<br/>
                                    combination can be used to be more resilient and faster. Kinesis with a worker to process the data received from the Kinesis stream<br/>

                                    <br/><br/>
                                    <img src="https://d2908q01vomqb2.cloudfront.net/b6692ea5df920cad691c20319a6fffd7a4a766b8/2019/02/01/ClickstreamSessionsKinesisGlueAthena3.png"></img>

                                    <br/><br/>
                                
                                    <b>usecases:</b>
                                    batch consumption of stream data: -- Kinesis Data Firehose -- S3 -- Glue -- S3 Data Lake -- Athena  <br/>
                                    real-time consumption of data: -- Kinesis Data Firehose -- Kinesis Data Analytics -- Kinesis Data Firehose -- Redshift -- QuickSight <br/>
                                    streaming transactional data with RDS  RDS -- Database Migration Service -- S3 -- Glue -- S3 Data Lake -- Athena    

                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>    
                    <div className="pad-top-10"></div>
                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                Amazon MSK
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                                creating your streaming applications without having to worry about the operational overhead of managing your Apache Kafka environment <br/>
                                provisioning, configuration, and maintenance of Apache Kafka clusters and Apache ZooKeeper nodes<br/>
                                Amazon Managed Streaming for Kafka to scale the brokers<br/><br/>
                                Apache Kafka/Amazon MSK allows you to process streaming data. <br/>
                                It guarantees the correct order of delivery of your data messages, but it uses the “at-least-once” delivery method. <br/>
                                At-least-once delivery means that the message will not be lost, but the message may be delivered to a consumer more than once.<br/>
                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                IOT Streaming
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                                IoT rules - cost effective near real time. can help evaluate and send notifications when the peak thresholds are exceeded <br/>
                                - WS IoT rules engine listens for incoming MQTT messages that match a rule.  <br/>
                                - When a matching message is received, the rule takes some action with the data in the MQTT message  <br/>
                                - for example, writing data to an Amazon S3 bucket, invoking a Lambda function, or sending a message to an Amazon SNS topic <br/>

                                <br/>
                                <img src="https://docs.aws.amazon.com/iot/latest/developerguide/images/what-is-aws-iot.png"></img>
                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                Amazon Redshift
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                                fully managed data warehousing solution providing standard SQL interface and ability to run complex queries <br/>
                                
                                <b>Loading data from Amazon S3</b>: <br/>
                                    - Splitting your data into multiple files <br/>
                                    - Uploading files to Amazon S3 <br/>
                                    - Using the COPY command to load from Amazon S3 <br/>

                                <b>loading data</b>: <br/>
                                - temporary staging tables to upload data/perform a merge (upsert)  <br/>
                                - COPY command with the manifest file to load data into Amazon Redshift (data in parallel from Amazon S3, Amazon EMR, Amazon DynamoDB, or multiple data sources on remote hosts) <br/>

                                <button  className="linkbutton" onClick={() => this.openUrl('https://docs.aws.amazon.com/redshift/latest/dg/c_loading-data-best-practices.html')}>Best Practices</button>
                                

                                <br/>
                                <b>COPY</b> - command to optimize performance, maximum amount of parallelism - split the files into smaller files (ideal size is between 1 MB and 125 MB after compression) <br/>
                                number of files should be a multiple of the number of slices in your cluster. <br/>
                                single large file, Amazon Redshift is forced to perform a serialized load, which is much slower <br/>

                                <b>NOLOAD</b> - checks the integrity of all of the data without loading it into the database.  <br/>
                                The NOLOAD option displays any errors that would occur if you had attempted to load the data <br/>

                                <b>UNLOAD ENCRYPTED </b>-  <br/>
                                command automatically stores the data encrypted using-client side encryption and uses HTTPS to encrypt the data during the transfer to S3 <br/>
                                UNLOAD automatically creates files using Amazon S3 server-side encryption with AWS-managed encryption keys (SSE-S3).  <br/>
                                You can also specify server-side encryption with an AWS Key Management Service key (SSE-KMS) or client-side encryption with a customer-managed key (CSE-CMK) <br/>

                                If you want to validate your data without actually loading the table, use the NOLOAD option with the COPY command. <br/>

                                Redshift does not allow encrypting existing cluster using HSM and there needs to be trust connection established between Redshift and HSM <br/>
                                <button  className="linkbutton" onClick={() => this.openUrl('https://docs.aws.amazon.com/redshift/latest/mgmt/working-with-db-encryption.html')}>Working with DB Encrption</button>
                                

                                <br/>
                                <b>workload management (WLM) </b>, each query is prioritized equally, which can cause a person, team, or workload to consume excessive cluster resources  <br/>
                                for a process which isn’t as valuable as other more business-critical jobs. <br/>
                                - separation of business concerns  <br/>
                                    - Interactive- Short-running, Long-running <br/>
                                    - Batch - bulk INSERT, UPDATE, and DELETE transactions, for example, ETL or ELT programs <br/>
                                - ex: Place the largest customers into a single user group with a dedicated query queue and place the rest of the customers into a different query queue <br/>
                                
                                <button  className="linkbutton" onClick={() => this.openUrl('https://docs.aws.amazon.com/redshift/latest/dg/cm-c-implementing-workload-management.html')}>Implementing Workload Management</button>
                                
                                You can configure up to eight queues with each queue having a maximum concurrency level of 50. <br/>

                                <br/>
                                <b>Redshift Enhanced VPC Routing</b> supports the use of standard VPC features such as VPC Endpoints, security groups, network ACLs,  <br/>
                                managed NAT and internet gateways, enabling you to tightly manage the flow of data between your Amazon Redshift cluster and all of your data sources <br/>
                                <b>VPC Endpoints</b> <br/>
                                - For traffic to an Amazon S3 bucket in the same region as your cluster, you can create a VPC endpoint to direct traffic directly to the bucket.  <br/>
                                - When you use VPC endpoints, you can attach an endpoint policy to manage access to Amazon S3.  <br/>
                                <b>NAT gateway</b>  <br/>
                                - To connect to an Amazon S3 bucket in another region  <br/>
                                - or to another service within the AWS network,  <br/>
                                - or to access a host instance outside the AWS network,  <br/>

                                <b>Star and snowflake schema data warehouse designs</b> - organize around a central fact table that contains measurements for a specific event, such as a sold item.  <br/>
                                The fact table has foreign key relationships to one or more dimension tables that contain descriptive attribute information for the sold item, such as customer or product.  <br/>
                                Snowflake schemas extend the star concept by further normalizing the dimensions into multiple tables <br/>
                                distribution keys is a good way to optimize the performance of Amazon Redshift when you use a star schema <br/>

                                In a typical star schema, the fact table has foreign key relationships with multiple dimension tables, so you need to choose one of the dimensions.  <br/>
                                You would choose the foreign key for the largest frequently joined dimension as a distribution key in the fact table and the primary key in the dimension table.  <br/>
                                Make sure that the distribution keys chosen result in relatively even distribution for both tables, and if the distribution is skewed, use a different dimension.  <br/>
                                Then analyze the remaining dimensions to determine if a distribution style of ALL, KEY, or EVEN is appropriate <br/>

                                <b>distribution strategy for a star schema </b><br/>
                                - to reduce cross-node traffic. <br/>
                                - to balance data distribution and collocation data <br/>
                                - to take advantage of data locality on a local node for joins and aggregates <br/> <br/>

                                <b>EVEN </b>distribution distributes the data across slides in a round robin fashion, regardless of the values in any particular column <br/>
                                and does not participate in joins <br/>
                                <b>KEY</b> Distribution If the data needs to be grouped by specific Key, KEY distribution should be used <br/>
                                ALL distribution ensures that every row is collocated for every join that the table participates in. <br/>

                                <b>COPY</b> data from multiple, evenly sized files.  <br/>
                                - Amazon Redshift is an MPP (massively parallel processing) database, where all the compute nodes divide and parallelize the work of ingesting data.  <br/>
                                Each node is further subdivided into slices, with each slice having one or more dedicated cores, equally dividing the processing capacity.  <br/>
                                The number of slices per node depends on the node type of the cluster.  <br/>
                                For example, each DS2.XLARGE compute node has two slices, whereas each DS2.8XLARGE compute node has 16 slices. <br/>

                                When you load data into Amazon Redshift, you should aim to have each slice do an equal amount of work.  <br/>
                                When you load the data from a single large file or from files split into uneven sizes, some slices do more work than others. <br/>

                                When loading multiple files into a single table, use a single COPY command for the table, rather than multiple COPY commands.  <br/>
                                Amazon Redshift automatically parallelizes the data ingestion.  <br/>
                                Using a single COPY command to bulk load data into a table ensures optimal use of cluster resources, and quickest possible throughput. <br/>

                                <b>DBLink</b> function allows the entire query to be pushed to Amazon Redshift. load near time data <br/>

                                When you launch an Amazon Redshift cluster, you can choose to encrypt it with a master key from the AWS Key Management Service (AWS KMS). AWS KMS keys are specific to a region. If you want to enable cross-region snapshot copy for an AWS KMS-encrypted cluster, you must configure a snapshot copy grant for a master key in the destination region so that Amazon Redshift can perform encryption operations in the destination region. <br/>

                                There are three ways to <b> resize</b> an Amazon Redshift cluster: <br/>
                                    Elastic resize: To quickly add or remove nodes from an existing cluster, use elastic resize. Elastic resize operations usually take a few minutes to complete. The cluster will be unavailable during that time. <br/>
                                    Classic resize: To change the node type, the number of nodes, or both, use classic resize. A classic resize copies tables to a new cluster. The source cluster will be in read-only mode until the resize operation finishes. <br/>
                                    Snapshot, restore, and resize: To keep your cluster available during a classic resize, first make a copy of the existing cluster, then resize the new cluster. Keep in mind that all data written to the source cluster after the snapshot is taken must be manually copied to the target cluster after the migration. <br/>

                                    <br/>

                                <b>snapshot/copy across region with KMS Keys</b> <br/>
                                - AWS KMS keys are specific to an AWS Region.  <br/>
                                - If you enable copying of Amazon Redshift snapshots to another AWS Region,  <br/>
                                - and the source cluster and its snapshots are encrypted using a master key from AWS KMS,  <br/>
                                - you need to configure a grant for Amazon Redshift to use a master key in the destination AWS Region.  <br/>
                                - i.e CreateSnapshotCopyGrant to allow Amazon Redshift to use the KMS key from the destination region. <br/>
                                - This grant enables Amazon Redshift to encrypt snapshots in the destination AWS Region. <br/>
                                - In the source region, enable cross-region replication and specify the name of the copy grant created. <br/>
                                In the source region, enable cross-region replication and specify the name of the copy grant created. <br/>

                                <b>audit logging </b>- logs authentication attempts, connections, disconnections,  each query run against the database (User log and user activity log).  <br/>
                                not enabled by default.  <br/>

                                <b>Redshift spectrum </b>
                                
                                Can  query and retrieve structured and semistructured data from files in Amazon S3 without having to load the data into Amazon Redshift tables. <br/>
                                Redshift Spectrum queries employ massive parallelism to execute very fast against large datasets.  <br/>
                                Redshift Spectrum also scales intelligently <br/>
                                Much of the processing occurs in the Redshift Spectrum layer, and most of the data remains in Amazon S3.  <br/>
                                Multiple clusters can concurrently query the same dataset in Amazon S3 without the need to make copies of the data for each cluster <br/>
                                - ex: use case: existing TB data in redshift, a tonne of data in S3 (ex: for previous years). query combination of these and generate report <br/>
                                -  CREATE EXTERNAL SCHEMA SPECTRUM <br/>

                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                AWS Glue
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                                Crawl the data with an AWS Glue crawler and update the AWS Glue Data Catalog to reflect the metadata. Then use Amazon Athena to query the transformed data. <br/>

                                - For daily incoming data, use AWS Glue crawlers to scan and identify the schema. use AWS Glue workflows with AWS Glue jobs to perform transformations. <br/>
                                - For archived data, use Amazon EMR to perform data transformations. <br/>

                                - Relationalize PySpark transform can be used to flatten the nested data into a structured format. <br/>

                                <br/>
                                <img src="https://docs.aws.amazon.com/athena/latest/ug/images/glue_architecture.png"></img>

                                <br/>
                                <img src="https://docs.aws.amazon.com/glue/latest/dg/images/PopulateCatalog-overview.png"></img>

                                <br/>
                                <img src="https://docs.aws.amazon.com/athena/latest/ug/images/glue_crawler.png"></img>
                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                    AWS Athena
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                            does not currently support cross region queries and querying data in Glacier. <br/>
                            can connect to Amazon Athena data sources and use Athena data to create Amazon QuickSight datasets. <br/>
                            Workgroups - Isolate users, teams, applications, or workloads into groups <br/>
                            ex: - unique Athena workgroup for each team, Within the workgroup, enforce encryption for the query results and create tags <br/>

                            <br/> <br/>
                            Athena Federated Queries to query data in sources other than S3. <br/>
                            his provides an easy way to query data without having to manage servers and query only when required. 
                            This would fulfill the ad hoc and cost effective requirements for business cases where single view (like infrastructure) from multiple accounts/vpcs application etc.,<br/>
                            Set up the necessary data connectors to register them as a catalog in Athena. Use Athena to run analyses for the multiple data sources.<br/>
                            
                            Applications can use Amazon Athena since it is a fully-managed service that doesn’t require a long-running database. <br/>
                            For high burst requests. During a period of high usage, with “ClientError: An error occurred (Throttling Exception)” message - Request a quota increase to increase the burst capacity of API calls<br/>

                            <b>Performance tuning tips</b>: Refer 
                            <button  className="linkbutton" onClick={() => this.openUrl('https://aws.amazon.com/blogs/big-data/top-10-performance-tuning-tips-for-amazon-athena/')}>Performance Tuning Tips</button>
                            <br/>
                            1. Partition your data <br/>
                            2. Bucket your data <br/>
                            3. Use Compression <br/>
                            4. Optimize file sizes <br/>
                            5. Optimize columnar data store generation <br/>
                            6. Optimize ORDER BY <br/>
                            7. Optimize joins <br/>
                            8. Optimize GROUP BY <br/>
                            9. Use approximate functions <br/>
                            10. Only include the columns that you need <br/>

                            Optimizing Partition Processing -  <br/>
                            Approach 1: Querying a single partition <br/>
                            Approach #2: Partition Projection <br/>
                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                    Simple Storage Service - S3
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                            <b>S3 Etags </b>(hash of the object) can be used to verify before and after migration/copy <br/>
                            <b>S3 Transfer Acceleration</b> is ideal with applications already integrated with S3 APIs. <br/> <br/>

                                Enable S3 Transfer Acceleration on the S3 bucket, and configure the application to use the Transfer Acceleration endpoint for uploads. <br/>
                                S3 Transfer Acceleration does not use Direct Connect, but routes the request through CloudFront. It is not an ideal solution for large data transfers, as it still uses the public internet.  <br/>
                                Configure the application to break the larger files into chunks and use a multipart upload to transfer files to Amazon S3.  <br/>
                                Multipart upload is ideal for uploading objects, it does not provide direct integration with hadoop <br/>

                                Depending on the size of the data you are uploading, Amazon S3 offers the following options: <br/>
                                    Upload objects in a single operation—With a single PUT operation, you can upload objects up to 5 GB in size. <br/>
                                    Upload objects in parts—Using the multipart upload API, you can upload large objects, up to 5 TB. The multipart upload API is designed to improve the upload experience for larger objects. You can upload objects in parts. These object parts can be uploaded independently, in any order, and in parallel. You can use a multipart upload for objects from 5 MB to 5 TB in size. <br/>

                                An Amazon S3 Glacier (Glacier) vault can have one resource-based vault access policy and one Vault Lock policy attached to it. A Vault Lock policy is a vault access policy that you can lock. Using a Vault Lock policy can help you enforce regulatory and compliance requirements.  <br/>

                                Fine grained access over S3 objects - consider AWS KMS Grants to provide access to KMS Keys <br/> <br/>
                                <b>Encryption</b>: <br/>
                                    Data at rest encryption using S3 can be implemented using either Server Side or Client Side encryption.  <br/>
                                    SSE can be implemented using either KMS provided keys (SSE-KMS) or Customer provided keys (SSE-C).  <br/>
                                    CSE can be implemented by encrypting the data before uploading it to S3 and then decrypting the data after downloading it from S3 at client side. <br/>

                                <br/>
                                <b>Glacier Select </b><br/>
                                perform filtering operations using simple Structured Query Language (SQL) statements directly on your data in S3 Glacier.  <br/>
                                When you provide an SQL query for a S3 Glacier archive object, S3 Glacier Select runs the query in place and writes the output results to Amazon S3.  <br/>
                                With S3 Glacier Select, you can run queries and custom analytics on your data that is stored in S3 Glacier, without having to restore your data to a hotter tier like Amazon S3 <br/>

                                <br/>
                                AWS Lake Formation is the solution for creating a central data store which you can grant access to databases, tables, and columns to data stored in Amazon S3. You can use Amazon Redshift Spectrum to run queries in Amazon S3.

                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  
                        <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                AWS Database Migration Service (DMS)
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                            a collection architecture - ex:  several relational databases, data warehouses, and NoSQL databases that hold transactional information about their financial trades and operational activities <br/>
                            collection infrastructure is best used for streaming transactional data from existing relational data stores. <br/> 
                            You create a task within the Database Migration Service that collects ongoing changes within your various operational data stores, an approach called ongoing replication or change data capture (CDC). <br/>
                            These changes are streamed to an S3 bucket where a Glue job is used to transform the data and move it to your S3 data lake.
                            
                            <button  className="linkbutton" onClick={() => this.openUrl('https://aws.amazon.com/dms/faqs/')}>DMS</button>
                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  
                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                AWS DataSync
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                                AWS DataSync to deploy a DataSync agent on-premises and replicate the data to a specified Amazon S3 bucket <br/>
                                DataSync provides an alternative option to S3 Transfer Acceleration. S3 Transfer Acceleration is ideal with applications already integrated with S3 APIs. <br/>
                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                    Snowball
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                            When you're using an Edge device, the data migration process has the following stages: <br/>
                            You use the AWS Schema Conversion Tool (AWS SCT) to extract the data locally and move it to an Edge device. <br/> 
                            You ship the Edge device or devices back to AWS. <br/>
                            After AWS receives your shipment, the Edge device automatically loads its data into an Amazon S3 bucket. <br/>
                            AWS DMS takes the files and migrates the data to the target data store. If you are using change data capture (CDC), those updates are written to the Amazon S3 bucket and then applied to the target data store. <br/>

                        Order an AWS Snowball device and copy the database by using the AWS Schema Conversion Tool.  <br/>
                        When the data is available in Amazon S3, use AWS DMS to load it to Amazon RDS, and configure a job to synchronize changes before the cutover.  <br/>

                        
                        <button  className="linkbutton" onClick={() => this.openUrl('https://docs.aws.amazon.com/dms/latest/userguide/CHAP_LargeDBs.html')}>Large DBs</button>
                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                    EMR
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                                Parallelize tasks across cluster and do mpp <br/>

                                A Streaming application reads input from standard input and then runs a script or executable (called a mapper) against each input.  <br/>
                                The result from each of the inputs is saved locally, typically on a Hadoop Distributed File System (HDFS) partition.  <br/>
                                After all the input is processed by the mapper, a second script or executable (called a reducer) processes the mapper results.  <br/>
                                The results from the reducer are sent to standard output.  <br/>
                                You can chain together a series of Streaming steps, where the output of one step becomes the input of another step. <br/>
                                
                                <button  className="linkbutton" onClick={() => this.openUrl('https://docs.aws.amazon.com/emr/latest/ReleaseGuide/UseCase_Streaming.html')}>UseCase Streaming</button>

                                <br/>
                                <b>S3DistCp </b>can be used to aggregate smaller files to large ones without any change to the existing applications and processes. <br/> 
                                Hadoop is optimized for reading a fewer number of large files rather than many small files, whether from S3 or HDFS.  <br/>
                                You can use S3DistCp to aggregate small files into fewer large files of a size that you choose, which can optimize your analysis for both performance and cost <br/>

                                Transform the unstructured data using Amazon EMR and generate CSV data. COPY the CSV data into the analysis schema within Redshift <br/>

                                <br/>
                                <b>best practice</b>:  <br/>
                                - master and core nodes as on-demand and use spot instances for the task nodes  <br/>
                                - use instance fleets with a provisioning timeout for the core nodes. <br/>
                                - For unpredictable workloads, the suggested pricing model is Spot or On-Demand. <br/>
                                - SSE-KMS and LUKS can be used to implement encryption at rest.  <br/>
                                - In addition to HDFS encryption, the Amazon EC2 instance store volumes and the attached Amazon EBS volumes of cluster instances are encrypted using LUKS <br/>
                                - zip is not ideal compression for files larger than 1GB  <br/>
                                - check whether the compression technique supports splitting, like bzip2, or pick one with higher compression handling, like Snappy <br/>
                                - use data partitioning to skip loading/filtering runs against the entire dataset <br/>

                                <b>Encryption </b><br/> 
                                SSE-S3 (amazon s3 manages) or SSE-KMS (you setup AWS KMS customer master key and policies) <br/>
                                SSE with customer-provided keys (SSE-C) is not available for use with Amazon EMR. <br/>
                                Local disk encryption can be enabled as part of a security configuration to encrypt root and storage volumes <br/>
                                Note: EBS root volume cannot be detached, with the cluster running <br/>

                                <img src="https://docs.aws.amazon.com/emr/latest/ManagementGuide/images/emr-encryption.png"></img>

                                <br/>
                                <b>EC2 instances</b> <br/>
                                C - compute intensive; R - memory intensive. ad-hoc queries; G - mpp/ml etc.,; T - general purpose; I & D -storage optimized <br/>

                                AWS per-second Billing - more cost efficient and performant to use more instances for shorter amount of time <br/>
                                Optional feature available when using Amazon EMR. Turn on EMRFS consistent view when configuring the EMR cluster. <br/>
                                
                                <b>EMRFS </b>would help to store the data into S3, which provides data durability also the ability to scale as per the demand.  <br/>
                                EMR File System (EMRFS) is an implementation of HDFS that all Amazon EMR clusters use for reading and writing regular files from Amazon EMR directly to Amazon S3. EMRFS provides the convenience of storing persistent data in Amazon S3 for use with Hadoop while also providing features like consistent view and data encryption. <br/>


                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                    All Other Open Source Tools
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                                
                            <b>Apache Spark </b>
                            
                            Distributed processing framework and programming model that helps you do machine learning, stream processing, or <br/>
                            graph analytics using Amazon EMR clusters. Similar to Apache Hadoop, <br/>
                            Spark is an open-source, distributed processing system commonly used for big data workloads. <br/>
                            However, Spark has several notable differences from Hadoop MapReduce. <br/>
                            Spark has an optimized directed acyclic graph (DAG) execution engine and actively caches data in-memory, <br/>
                            which can boost performance, especially for certain algorithms and interactive queries. <br/>

                            <b>Presto </b>
                            open source distributed sql<br/>
                            
                            Petabytes of data with the interactive ability<br/>
                            Presto allows ad hoc query analysis over multiple data sources like Redshift, MySQL, Hive on EMR and PostgreSQL.<br/>
                            Non-relational sources such as HDFS, Amazon S3, Cassandra, MongoDB, and HBase can also be queried. <br/>
                            Presto can query data where it is stored, without needing to move data into a separate analytics system. <br/>
                            Query execution runs in parallel over a pure memory-based architecture, with most results returning in seconds.<br/>
                            performance is much better than Pig as it uses a custom query execution engine <br/>

                            <b>Oozie</b> is a scheduler tool<br/>

                            <b>Hive</b>
                            
                            Open-source, data warehouse, and analytic package that runs on top of a Hadoop cluster. <br/>
                            Hive scripts use an SQL-like language called Hive QL (query language) that abstracts programming models and <br/>
                            supports typical data warehouse interactions. <br/>
                            Hive enables you to avoid the complexities of writing Tez jobs based on directed acyclic graphs (DAGs) or <br/>
                            MapReduce programs in a lower level computer language, such as Java. <br/>
                            Hive extends the SQL paradigm by including serialization formats. <br/>

                            <b>Pig</b> 
                            is based on Map Reduce execution and more ideal for batch processing<br/>

                            <b>Ganglia</b> 
                            is a scalable distributed monitoring system for high-performance computing systems such as clusters and Grids  while minimizing the impact on their performance.<br/>
                            

                            <b>Apache Flink </b>
                            is a streaming dataflow engine that you can use to run real-time stream processing on high-throughput data sources. <br/>
                            use case: use flink with kafka/MSK to stream data. similar to kinesis stream<br/>

                            <b>Hue (Hadoop User Experience) </b>
                            is an open-source, web-based, graphical user interface for use with Amazon EMR and Apache Hadoop. <br/>

                            <b>Apache Sqoop </b>
                            is a tool for transferring data between Amazon S3, Hadoop, HDFS, and RDBMS databases. <br/>
                            

                            <b>Apache Phoenix </b>
                            is used for OLTP and operational analytics<br/>
                            
                            allowing you to use standard SQL queries and JDBC APIs to work with an Apache HBase backing store. <br/>

                            <b>YARN</b> is the resource management and job scheduling technology in the open source Hadoop distributed processing framework.<br/>
                            <br/>

                            <b>HBase </b>- key-value storage as the database
                            
                            HBase is an open source, non-relational, distributed database developed as part of the Apache Software Foundation's Hadoop project. <br/>
                            HBase runs on top of Hadoop Distributed File System (HDFS) to provide non-relational database capabilities for the Hadoop ecosystem.<br/>
                            HBase works seamlessly with Hadoop, sharing its file system and serving as a direct input and output to the MapReduce framework and execution engine. <br/>
                            HBase also integrates with Apache Hive, enabling SQL-like queries over HBase tables, joins with Hive-based tables, and support <br/>
                            for Java Database Connectivity (JDBC). <br/>

                            <b>HCatalog </b>
                            
                            is a tool that allows you to access Hive metastore tables within Pig, Spark SQL, and/or custom MapReduce applications. <br/>
                            HCatalog has a REST interface and command line client that allows you to create tables or do other operations. <br/>

                            <b>Apache Ranger</b>
                            
                            is a framework to enable, monitor, and manage comprehensive data security across the Hadoop platform. <br/>
                            Features include a centralized security administration, fine-grained authorization across many Hadoop components <br/>
                            (Hadoop, Hive, HBase, Storm, Knox, Solr, Kafka, and YARN) and central auditing. <br/>
                            It uses agents to sync policies and users, and plugins that run within the same process as the Hadoop component, for example, NameNode.<br/>

                            <b>Jupyter Notebook </b><br/>
                            
                            is an open-source web application that you can use to create and share documents<br/> 
                            that contain live code, equations, visualizations, and narrative text<br/>
                            can connect with sources built out of S3, SQL databases, MongoDB, Redis, RDS, other file systems<br/>

                            <b>Apache Zeppelin </b>
                            
                            is a web-based notebook that enables data-driven, interactive data analytics and collaborative documents with SQL, Scala and more<br/>
                            Apache Zeppelin interpreter concept allows any language/data-processing-backend to be plugged into Zeppelin. <br/>
                            Currently Apache Zeppelin supports many interpreters such as Apache Spark, Python, JDBC, Markdown and Shell.<br/>
                            Some basic charts are already included in Apache Zeppelin. Visualizations are not limited to SparkSQL query, <br/>
                            any output from any language backend can be recognized and visualized.<br/>

                            <b>Apache DistCp (S3DistCp)</b>
                            
                            is an open-source tool you can use to copy large amounts of data. <br/>
                            S3DistCp is an extension of DistCp that is optimized to work with AWS, particularly Amazon S3. <br/>
                            The command for S3DistCp in Amazon EMR version 4.0 and later is s3-dist-cp, which you add as a step in a cluster or at the command line. <br/>
                            Using S3DistCp, you can efficiently copy large amounts of data from Amazon S3 into HDFS where it can be processed by subsequent steps in your Amazon EMR cluster. <br/>
                            You can also use S3DistCp to copy data between Amazon S3 buckets or from HDFS to Amazon S3. <br/>
                            S3DistCp is more scalable and efficient for parallel copying large numbers of objects across buckets and across AWS account<br/>

                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                Quicksight - cloud powered business intelligence service
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                            QuickSight can provide visualization with out-of-the-box integration with Redshift and S3 JSON documents, as well as handle Excel files<br/>
                            QuickSight allows you to directly connect to and import data from a wide variety of cloud and on-premises data sources. <br/>
                            These include SaaS applications such as Salesforce, Square, ServiceNow, Twitter, Github, and JIRA; <br/>
                            3rd party databases such as Teradata, MySQL, Postgres, and SQL Server; native AWS services such as Redshift, Athena, S3, RDS, and Aurora; <br/>
                            and private VPC subnets. You can also upload a variety of file types including Excel, CSV, JSON, and Presto<br/>
                            Files in Amazon S3 that have been compressed with zip, or gzip, can be imported as-is. <br/>
                            If you used another compression program for files in Amazon S3, or if the files are on your local network, unzip them before importing them<br/>
                            Use a story option to preserve multiple iterations of an analysis and play the iterations sequentially.<br/>
                            <br/>
                            Datasets created using Amazon S3 as the data source are automatically imported into SPICE. <br/>
                            The Amazon QuickSight console allows for the refresh of SPICE data on a schedule. <br/>
                            ability to automatically update SPICE (Super-fast, Parallel, In-memory, Calculation, Engine) datasets in Amazon QuickSight based on a schedule. You can specify the Time Zone, the Time of Day, the Frequency (Daily, Weekly, or Monthly), and the Starting Date for your SPICE dataset refresh. This capability eliminates the manual process of updating SPICE datasets and allows you to easily share dashboards with up-to-date information. Scheduled refresh is now available in Amazon QuickSight for all supported AWS, cloud and on-premises data sources, for both new and existing datasets in all regions.<br/>

                            <br/><br/>
                            <b>Supports following formats.</b> <br/><br/>
                                CSV/TSV – Delimited text files<br/>
                                ELF/CLF – Extended and common log format files<br/>
                                JSON – Flat or semistructured data files<br/>
                                XLSX – Microsoft Excel files<br/>

                                <br/><br/>
                            
                            <button  className="linkbutton" onClick={() => this.openUrl('https://docs.aws.amazon.com/quicksight/latest/user/working-with-visual-types.html')}>Working with Visual Types</button>
                            <br/><br/>
                            Bar Chart    - can be used to represent the data for comparison in "x" for each "y". ex: sales for each region.<br/>
                            Histogram    - similar to bar but for continuous data, where the bins represent ranges of data, while a bar chart is a plot of categorical variables.<br/>
                            Pie charts   - compare parts of a whole, i.e., compare values for items in a dimension. They DO NOT show changes over time. <br/>
                            Line graphs  - track changes over short and long periods of time<br/>
                            Column chart  - each category is represented by a rectangle, with the height of the rectangle being proportional to the values being plotted<br/>
                            line charts  - to compare changes in measure values over period of time <br/>
                            Scatter plot - two-dimensional data visualization. Dots represent the values obtained for two different variables - one plotted along the x-axis and the other plotted along the y-axis.  visualize two or three measures for a dimension. <br/>
                            Stacked area - an extension of a basic area chart to display the evolution of the value of several groups on the same graphic<br/>
                            Heat maps    - display data in a tabular fashion. if you want to identify trends and outliers, because the use of color makes these easier to spot. <br/>
                            pivot tables - display data in a tabular fashion. if you want to analyze data on the visual. show measure values for the intersection of two dimensions.<br/>

                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                Elasticsearch - provides full text search capability and is a fully managed AWS service
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                            load streaming data into your Amazon Elasticsearch Service domain from many different sources. <br/>
                            Some sources, like Amazon Kinesis Data Firehose and Amazon CloudWatch Logs, have built-in support for Amazon ES. <br/>
                            Others, like Amazon S3, Amazon Kinesis Data Streams, and Amazon DynamoDB, use AWS Lambda functions as event handlers. <br/>
                            The Lambda functions respond to new data by processing it and streaming it to your domain.<br/>
                            Ingest and index the content into an Amazon Elasticsearch domain<br/>

                            
                            <button  className="linkbutton" onClick={() => this.openUrl('https://docs.aws.amazon.com/elasticsearch-service/latest/developerguide/es-aws-integrations.html')}>AWS Integrations</button>
                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                AWS CloudFront
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                                clickstream batch analysis for large global data only once per week  <br/>
                                Gigabit-Scale HTTP(S) global request distribution service and works fine with peaks higher than 10 Gbps or 15,000 RPS. <br/>
                                It can handle scale, geo-spread, spikes, and unpredictability. Access Logs will contain the GET data. EMR can be used for batch analysis <br/>
                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  

                    <Accordion>
                        <AccordionItem>
                            <AccordionItemHeading>
                                <AccordionItemButton>
                                CloudWatch
                                </AccordionItemButton>
                            </AccordionItemHeading>
                           
                            <AccordionItemPanel>
                                CloudWatch Logs agent to consolidate logs into a single CloudWatch Logs group.
                            </AccordionItemPanel>
                        </AccordionItem>
                    </Accordion>  
                      
                </div>              
            </div>
           
          </div>
      </div>
    );
  }
}
 
// Expose the DataAnalytics component as this module's default export.
export default DataAnalytics;