import React, { Component } from "react";
import Accordion from "@material-ui/core/Accordion";
import AccordionSummary from "@material-ui/core/AccordionSummary";
import AccordionDetails from "@material-ui/core/AccordionDetails";

class MLBase extends Component {

    openUrl(value){
        window.open(value, false);
      }
      
      
  render() {
    return (
        <div className="pad-top-30 pad-left-page">
        <div class="row">
         <div class="col-sm-7 col-md-5">
               <div className="">
                 <h2>AWS Certified Machine Learning – Specialty</h2>  <br/>      
                       
                   The notes below are for documentation and reference purposes only. Please make sure to refer to the AWS Documentation and other materials. <b>There is no compression algorithm for experience!</b><br/><br/>
                   References:<br/>

                   - <a className="blogMenuText" onClick={() => this.openUrl('https://aws.amazon.com/training/learning-paths/machine-learning/exam-preparation')}>AWS ML Exam Preparation!</a><br/>
                   - <a className="blogMenuText" onClick={() => this.openUrl('https://docs.aws.amazon.com/machine-learning/latest/dg/what-is-amazon-machine-learning.html')}>AWS ML Documentation!</a><br/>
                   - <a className="blogMenuText" onClick={() => this.openUrl('https://aws.amazon.com/sagemaker/faqs/')}>SageMaker Faqs</a><br/>
                   - <a className="blogMenuText" onClick={() => this.openUrl('https://aws.amazon.com/sagemaker/groundtruth/faqs/')}>GroundTruth Faqs</a><br/>
                   - <a className="blogMenuText" onClick={() => this.openUrl('https://jayendrapatil.com/aws-certified-machine-learning-specialty-mls-c01-exam-learning-path/')}>jayendrapatil.com</a><br/>
                   - <a className="blogMenuText" onClick={() => this.openUrl('https://whizlabs.com')}> whizlabs </a>The course video and labs are exemplary. Do make sure to do practice exam<br/>
                   - <a className="blogMenuText" onClick={() => this.openUrl('https://www.udemy.com/course/aws-machine-learning/')}> Udemy </a>By Stephane Maarek and Frank Kane. Thumbs Up!!!<br/>
                   - <a className="blogMenuText" onClick={() => this.openUrl('https://www.examtopics.com/exams/amazon/aws-certified-machine-learning-specialty/')}>examtopics.com practice exams</a><br/>
                   
                   - AWS FAQs for individual products/services (ex: below)<br/>
                   - rekognition, translate, transcribe, comprehend, forecast, fraud-detector, polly, lex, textract faqs<br/>


                   <br/><br/>
                   
                       <img width="50%"  src="https://d1.awsstatic.com/SageMaker/4_how_it_works_diagram_jan29-rev%402x.5387bf546e908574c995b3a1bc13709d18b5e81a.png"></img>
                   
                   <br/><br/>
                       <Accordion>
                           <AccordionSummary>
                              <b>Data Science Basics</b>
                              
                           </AccordionSummary>
                           <AccordionDetails>
                           <div>
                            Kinds of data<br/>
                            - numerical<br/>
                            - categorical<br/>
                            - ordinal<br/>
                                kind of data will dictate what model we will use<br/><br/>

                                Numerical - quantifiable thing we can measure. ex: height, pageload time, price<br/><br/>
                                - Discrete data - integer base. how many purchases, how many time did i get "heads"<br/>
                                - Continuous data - they describe the quantity and not the quality. infinite number of possible values. ex: checkout time, height of a person (can be a random number with decimal leading to infinite possibilities)<br/><br/>

                                Categorical - qualitative data that has no inherent mathematical meaning<br/>
                                - you can assign numbers (choose a number) to categories to represent them more compactly BUT those numbers dont have any mathematical meaning<br/>
                                - made up of a set of categories<br/>
                                - they describe the quality not quantity<br/>
                                - they are distinct <br/>
                                ex: gender, yes/no (binary data), residence states, product category <br/><br/>

                                Ordinal - combination of numerical and categorical<br/>
                                - ex: movie rating - 1 - 5 scale (ratings must be 1,2,3,4,5). ex: 1 star is lesser than 5<br/>

                                <br/><br/>

                            Data Distributions<br/><br/>

                            - characterising data to fall into a specific range<br/>

                                Normal Distribution - For continuous Data.<br/>
                                gives a probability of a data point falling within some range of a given value<br/>
                                note there can be an infinite possibility of a number occurring in a range<br/>
                                - closer to the mean the probability is higher. farther on either sides probability is lower<br/>
                                - Probability density function - solid curve that describes a probability of range of values that happens in continuous data<br/>
                                - the area under the curve over a given range is the probability of a value occurring within that range. <br/><br/>

                                Poisson Distribution - number of times an event occurs in a fixed interval of time or space. ex: number of homes that can be sold in a day<br/>
                                deals with discrete data (whole value)<br/>
                                ex: how many calls a call center receives, how many mail we receive on a given day. you will never get a half a caller, half letter or half home sold in a day<br/>
                                For discrete data, Probability Mass Function - probabilities of discrete values occurring in the data set<br/><br/>


                                Binomial Distribution - based on events with yes/no, 0 or 1, true/false kind of questions<br/>
                                - ex: flipping a coin will result in a "heads" or "tails"<br/><br/>


                                Bernoulli Distribution - special case of binomial distribution<br/>
                                - can think of a binomial distribution as the sum of independent Bernoulli trials<br/>
                                - has a single trial (n=1)<br/><br/>
                                

                            Time Series Analysis<br/>

                                Trends<br/>
                                - overall trend over time. ex: inflation over a period of time<br/><br/>

                                Seasonality<br/>
                                - peak of a disease over a specific time (ex: spring/fall allergies)<br/><br/>

                                Both - trend and seasonality<br/>

                                Noise - some variations just around in nature<br/>

                                Additive Model - ex: seasonal variation is constant. <br/>
                                time series = seasonality + trends + noise<br/><br/>

                                Multiplicative model - seasonal variation increases as the trend increases<br/>
                                time series = seasonality * trends * noise<br/><br/>
                            </div>

                           </AccordionDetails>
                       </Accordion>    
                       
                       <Accordion >
                           <AccordionSummary>
                               <b>Data Engineering</b>
                              
                           </AccordionSummary>
                           <AccordionDetails>
                               <div>
                               Datasets <br/><br/>
                                - a collection of data used as the "fuel" for our ML models. ex: kaggle open/free datasets for food/medicine/vaccinations/temperature around the world<br/><br/>

                                Features<br/>
                                - columns in the dataset. ex: latitude, longitude in a map dataset. a feature describes/provides attributes of the data <br/>
                                - Categorical features (quality) vs Continuous features (quantity)<br/>
                                - Corpus Data - data collected from text. data from newspaper, wikipedia. Used in natural language processing (NLP). speech recognition/text-to speech<br/>
                                - Ground truth data <br/><br/>
                                
                                Observations<br/>
                                - rows in a dataset. each observation provides details about that data. ex:  a city row on the map dataset<br/><br/>

                                Dataset types<br/>
                                - can be comma separated (csv), JSON<br/>
                                - can also be images, audio or video<br/>
                                - structured data - has schema. a table of values such as relational database<br/>
                                - unstructured data - ex: pdfs, images, video, audio, logs, tweets<br/>
                                - semi structured data - contains tags to separate semantic elements/enforce hierarchies. ex: csv, json, xml etc.<br/>
                                - labeled data - has a target attribute<br/>
                                - unlabeled data - has no target attribute. ex: sound byte data<br/>
                                - supervised learning - develop predictive model on both input and output data - mostly used with labeled data. either "classification" or "regression"<br/>
                                - unsupervised learning - group and interpret data based on only input data - mostly used with unlabeled data - "clustering" <br/><br/>


                                Data engineering<br/><br/>

                                ML Cycle<br/>
                                - create your data - produce data, engineer data<br/>
                                - produce your model - train & test the model, evaluate the model<br/>
                                - execute your model - monitor and evaluate, execute model<br/><br/>

                                Steps in ML model<br/>
                                1) Gather your data - can be unique, publicly available (kaggle, reddit, google dataset, UCI ML repository, scikit-learn.org datasets), html scrape<br/>
                                2) Handle missing data - null value replacement, mode/median/average replacement, remove entire record, model based imputation (KNN, regression etc), Interpolation/Extrapolation, forward filling and backward filling, hot deck<br/>
                                3) feature extraction - AKA dimensionality reduction<br/>
                                4) feature selection - rank based on importance, remove less important, use PCA (principal component analysis), encode categorical data to integers<br/>
                                5) encode categorical values - encode categorical data to integers, one hot encoding - introduce as columns with 0 or 1 values<br/>
                                6) numeric feature engineering - transform numeric values, change values to be on the same scale (normalization rescales to a range 0,1), Standardization - rescale data to have a mean of 0 and a standard deviation of 1 (unit variance), binning (bucketize into categories)<br/>
                                7) split the data into training and testing sets - generally 80%, 20% split<br/><br/>

                                Data analysis<br/><br/>

                                Modeling<br/>

                                algorithms<br/><br/>

                                Implementations and Operations<br/>


                                Data storage - <br/>

                                S3. <br/>
                                - storage tiers (standard to glacier)<br/>
                                - security - iam role, policies, bucket policies<br/><br/>

                                Streams<br/>
                                Kinesis <br/>
                                -  Data - realtime<br/>
                                - firehose - near realtime to S3, elasticsearch, splunk<br/>
                                - analytics - query realtime<br/>
                                - video - video stream data<br/><br/>

                                Glue <br/>
                                - catalog - crawl data in s3, rds/redshift (jdbc)<br/>
                                - glue job - using spark - python or scala<br/>
                                - Use ML duplicate rows removal feature<br/>
                                - random cut forest <br/>
                                - hot/dense data <br/><br/>

                                Athena - serverless query. data formats (csv, json, parquet, orc, avro) using glue crawler/catalog<br/>
                                - structured , semi structured, unstructured data<br/>
                                - presto under the hood<br/>
                                - integrates with jupyter, zeppelin, RStudio notebooks<br/>
                                - integration with s3, quicksight, redshift.<br/>
                                - odbc/jdbc integration with other tools<br/><br/>

                                Quicksight - cloud powered business analytics service<br/>
                                - build visualizations<br/>
                                - perform adhoc analysis<br/>
                                - serverless<br/>
                                - integrations - s3(excel, csv, tsv), redshift, aurora/rds, athena, ec2,  jdbc/odbc (salesforce)<br/>
                                - Data preparations allows limited etl<br/>
                                - row level security, eni/direct connect capability, MFA auth<br/>

                                    SPICE - super fast parallel in memory calculation engine<br/>
                                    - columnar storage in memory machine code generation<br/>
                                    - scales to hundreds of thousands of users <br/>
                                    - adhoc exploration, dashboards<br/>
                                    - stories - guided tours through specific views of analysis<br/><br/>

                                    ML powered -<br/>
                                    - Anomaly detection (Random cut forest)<br/>
                                    - forecasting (seasonality etc)<br/>
                                    - auto-narratives (way to build dashboard in plain language)<br/><br/>

                                    antipattern<br/>
                                    - highly formatted canned reports (use quick sight for ad hoc queries, analysis and visualization)<br/>
                                    - etl (use glue, even though quicksight can provide limited etl capabilities)<br/>

                                Data pipelines - rds/dynamodb to s3<br/>
                                DMS - mysql to postgres etc.,<br/>
                                Step Functions - orchestrate each step <br/><br/>

                               </div>
                            </AccordionDetails>
                       </Accordion>    
                       <Accordion>
                           <AccordionSummary>
                              <b>Feature Engineering</b>
                              
                           </AccordionSummary>
                           <AccordionDetails>
                               <div>
                                        Feature engineering<br/><br/>
                                            - "applied machine learning is basically feature engineering" - Andrew Ng<br/>
                                            - dimensionality - refers to the number of features (input variables) in your dataset<br/>
                                            - applying the knowledge of the data. trim down the features, reduce redundancy<br/>
                                            - "curse of dimensions" - too many features can be problematic<br/>
                                            - large feature (column) to observations (row) ratio can worsen the training time<br/>
                                            - create better features to train your model<br/>
                                            - there can be new features from existing data. <br/>
                                            - combine several features into one to reduce the dimensionality mathematically<br/>
                                            - there is no one practice fits all.<br/>
                                            - there will be missing data, redundant data<br/>
                                            - NOTE: using feature selection (filter redundant/removal) and extraction (like PCA - principal component analysis) can reduce dimensionality. ex: reduce from 4 to 2 features<br/>
                                            - feature selection - ex: "Variance thresholds" can reduce irrelevant column (from 3 columns to 2)<br/><br/>
                                            - feature extraction - standardization (mean of zero) and normalization will create a new smaller set of features that still captures most of the information<br/>
                                                - PCA is unsupervised algorithm that creates new features by linearly combining original features<br/>
                                                - new features are uncorrelated meaning they are orthogonal<br/>
                                                - new features are ranked in order of "explained variance" (tells you how much information - variance can be attributed to principal components)<br/>
                                                - first PC1 explains the most variance of the dataset, PC2 explains the second most variance etc.,<br/>
                                                - you lose some of the variance (information) when you reduce your dimensional space<br/>
                                                - can be used to assist in visualization of your data<br/>
                                                - PCA can also assist in speeding up your machine learning<br/>
                                                

                                            - ex: how much money people make in stock market<br/>
                                            - feature in this case can be - age of the person, address, some of them are relevant and some of them may not be. car they own, where they live, house<br/>
                                            - too many variables will be added into dimension. ex: age, weight, height, sex, address, city, what car they own<br/><br/>


                                            note:<br/>
                                            - too many features can be a problem. - leads to sparse data<br/>
                                            - keep what features that matter the most.<br/><br/>

                                            PCA - principal component analysis. takes away different features (unnecessary) and keeps simpler/minimal dimensions<br/>
                                            K-Means - clustering. another algorithm to keep minimal/required dimensions.<br/>

                                            <br/><br/>

                                            Imputation - Missing data<br/>

                                            Null values - several approaches to handle missing data<br/>
                                                - do nothing. ex: ignore them as LightGBM does with its "use_missing=false" parameter<br/>
                                                - remove entire record (observations). - risk losing data points with valuable information<br/>
                                                - Mode/median/average value replacement - works for non categorical (numerical) values<br/>
                                                - most frequent values<br/>
                                                - model based imputation - preprocess with a model/pipeline algorithm to handle missing values (K nearest neighbors, regression, deep learning)<br/>
                                                - interpolation/extrapolation<br/>
                                                - forward filling or backward filling<br/>
                                                - hot deck imputation<br/><br/>


                                                Mean replacement<br/>
                                                - if a value in a column is missing/empty, replace it with the mean of the other values in that column. missing can be null, NaN, empty/blank <br/>
                                                - advantage: overall doesnt affect the mean this way<br/>
                                                - note: overall it is not the best. in general very fast and easy<br/>
                                                - but generally its not the best choice for the imputation. if there is an age and income it can completely miss<br/>
                                                - it cannot be replaced on a categorical data. <br/>
                                                - one step better than null removal. but doesnt consider any correlation between features<br/><br/>

                                                Median replacement<br/>
                                                - if there are more outliers, median replacement is better. <br/>

                                                ex: at times for missing values, replacement of mean can skew the numbers. in a case where for a missing "income" column if there are too many millionaires the mean will skew the overall number<br/>
                                                This becomes an outlier. in this case median replacement is a better option<br/><br/>

                                                Multiple Imputation<br/>
                                                - to handle missing data<br/>
                                                - instead of substituting a single value for each missing data point, the missing values are replaced with a set of plausible values which contain the natural variability and uncertainty of the right values<br/>
                                                - use case multi column dataset with one column missing 30% of its data<br/><br/>

                                                Dropping<br/>
                                                - quick and easier<br/>
                                                - note: overall not the best<br/>
                                                - dropping a few rows doesnt introduce a bias then its a good idea<br/>
                                                - ex: too many similar rows are having a column with empty/null row<br/>
                                                - other options include replacing with a similar value instead of dropping completely (especially in production data)<br/><br/>

                                                Machine Learning<br/>

                                                    KNN - K Nearest neighbors<br/><br/>
                                                    - find "k" . most similar rows, average their values<br/>
                                                    - assumes numerical and not categorical<br/>
                                                    - handle categorical data (Hamming distance) but categorical data is probably better served by deep learning model<br/>
                                                    - uses 'feature similarity' to predict missing values<br/><br/>

                                                    Deep Learning - build machine learning model to impute data for your ML model<br/>
                                                    - works for categorical and non-numerical<br/>
                                                    - very complicated, lot of code may be implemented<br/><br/>

                                                    Regression<br/>
                                                    - regression analysis is a set of statistical processes for estimating the relationships between a dependent variable <br/>
                                                    (often called the 'outcome variable') and one or more independent variables (often called 'predictors', 'covariates', or 'features')<br/>
                                                    - Regression analysis is a form of predictive modelling technique which investigates the relationship between a dependent (target) and independent variable (s) (predictor). <br/>
                                                    This technique is used for forecasting, time series modeling and finding the causal effect relationship between the variables<br/>
                                                    - ex: game application which has few years of data (season, release etc). if we are to predict read/write capacity in advance<br/>
                                                    - regression model can help predict or forecast data based on earlier dataset<br/>
                                                    - predictors of the variable with missing values identified via correlation matrix<br/>
                                                    - best predictors are selected and used as independent variables in a regression equation<br/>
                                                    - variable with missing data is used as the target variable<br/><br/>
                                                    
                                                    Multiple regression<br/>
                                                    - find linear or non linear using missing feature and other features<br/>
                                                    - MICE - multiple imputations by chained equation - Most advanced technique. Probably the best feature in ML<br/><br/>

                                                    Just get More data<br/>
                                                    - if there are more missing data, best/ideal way is to get more data<br/><br/>
                                                    
                                                    Interpolation/Extrapolation<br/>
                                                    - estimate values from other observations within the range of a discrete set of known data points<br/>
                                                
                                                    Forward filling/backward filling<br/>
                                                    - fill the missing value by filling it from the preceding or succeeding value<br/><br/>
                                                    
                                                    Hot deck imputation<br/>
                                                    - randomly choosing the missing value from a set of related and similar variables<br/><br/>
                                                    
                                            

                                            Encoding Categorical Values<br/><br/>

                                                Binarizer encoding<br/>
                                                - for yes/no, true/false feature<br/>
                                                - replace male/female columns with 0 or 1<br/><br/>
                                                Label Encoding<br/>
                                                - ex: for an enum of values. ex "for work class - private, public, business etc." returns 0, 1, 2 etc.,<br/>
                                                - note the number generated by this encoding, even though it doesn't have any meaning, can/may be interpreted by the system as a value/weight<br/>
                                                - ordinal encoder or one hot encoding will avoid ordinality<br/><br/>
                                                One hot encoding<br/>
                                                - convert categorical values into columns. <br/>
                                                - example male/female values can be introduced with two new columns representing male and female with 0/1 values respectively<br/>
                                                - NOTE: this can explode the number of columns. ex: country value split into each individual column (per country)<br/>
                                                - in those exploding scenario, label encoding is a better option<br/><br/>


                                            Handling Unbalanced Data<br/>
                                            - if there discrepancy between "positive" and "negative" data. ex: fraud data between training and actual data<br/>
                                            - especially when there is no actual data. <br/>
                                            - "positive" does not mean good. positive means "is this the actual case i am testing"<br/><br/>

                                            Oversampling<br/>
                                            - duplicate samples from minority class. fabricate/copies <br/>
                                            - can be done at random<br/><br/>

                                            Undersampling<br/>
                                            - remove more positive samples or majority case. <br/>
                                            - ex: remove negative ones (throwing data away - specifically avoid some scaling issues)<br/>
                                            - note:usually not the best approach<br/><br/>

                                            SMOTE - synthetic minority over sampling technique<br/>
                                            - smote sampling technique uses the k-nearest neighbors algorithm to create synthetic observations to balance a training data set<br/>
                                            - artificially creates new samples of minority class using nearest neighbors. Run K-nearest neighbors on each sample, create a new sample from KNN result (mean of neighbors)<br/>
                                            - generates new samples and undersamples majority class<br/>
                                            - generate better than just oversampling<br/>
                                            - usecase: preprocess data to balance it. ex: before observations from one country has imbalanced samples when compared to others<br/><br/>

                                            GANs - Generative Adversarial Network technique<br/>
                                            - generates unique observations that more closely resemble the real minority observation set<br/>
                                            - by contrast, observations that are exact replicas of existing minority class observations (as in random oversampling) are less effective than observations created by techniques like GANs that produce unique synthetic observations<br/>
                                            - usecase: to detect fraudulent transactions. where transactions set has 99.99% non fraudulent transactions. fraudulent observations are minority<br/><br/>


                                            Adjusting thresholds<br/>
                                            - have a threshold of probability at which point a prediction is flagged as the positive case.<br/>
                                            - if there are too many false positives increase the threshold<br/>
                                            - reduces false positives but could result in more false negatives<br/>
                                            - be careful about raising threshold as necessary. example cost of false positive vs false negative<br/>
                                            - ex: x,y,z happens then there is a fraud transaction (predicting fraud/not fraud). <br/><br/>

                                            
                                            Outliers<br/>

                                                Variance (sigma squared) - average of the squared differences from the mean<br/>
                                                - ex: number of people voting in a minute every hour <br/>
                                                every hour 				- 1,2,3,2<br/>
                                                Mean 						- (1+2+3+2)/4 = 2<br/>
                                                Differences from the mean - (-1, 0, 1, 0)<br/>
                                                Squared difference        - (1,  0, 1,  0)<br/>
                                                Variance                  - (1 + 0 + 1 + 0) / 4 = 0.5<br/><br/>

                                                Standard Deviation - square root of variance<br/>
                                                - way to identify outliers.<br/>
                                                - data points that lie more than one standard deviation from the mean can be considered unusual<br/>
                                                - how many standard deviation away from the mean<br/>
                                                - Ex: above example Square root of 0.5 = 0.707106781<br/><br/>


                                                Dealing with outliers<br/>
                                                - sometimes remove outliers from training data. be careful doing this (ex: not enough reviews data on feedback data)<br/>
                                                - only throw if its not consistent with the model<br/>
                                                - standard deviation can suggest. data points more than some multiple of a standard deviation in the training data<br/>
                                                - AWS algorithm - random cut forest algorithm - found within QuickSight, Kinesis Analytics, SageMaker and more<br/><br/>
                                            
                                            
                                            Binning  <br/>
                                            - discretization or quantization<br/>
                                            - categorical binning - group categorical values to gain insight into data. ex: countries by region<br/>
                                            - numerical binning - divides continuous feature into a specified number of categories or bins, thus making the data discrete. reduce the number of discrete intervals<br/>
                                            - quantile binning - categorize/distribute data in even size on each bin. define the bins using percentiles based on distribution of data. helps to discover non linearity in the variable's distribution by grouping observed values together.<br/>
                                            - take numerical data into categorical<br/>
                                            - put a group of ages (in 20, 30 into one/each classification) - bucketize into a category<br/><br/>

                                            - why <br/>
                                            - there could be uncertainty in the measurements. ex: 22.25 or 22.80 on an age may not provide value to a feature. binning can be an option<br/>
                                            - transform numerical data to an ordinal data. ex: "stars" on the review for each number (1-5). <br/><br/>
                                            
                                            Transforming - apply some function to a feature to make it more suited for training<br/>
                                            - getting new feature by transforming. ex: youtube transformation use case with square and root of x.<br/>
                                            - allows learning of super and sub-linear functions<br/>
                                            ex: log transformation for something exponential trend<br/><br/>

                                            Encoding - transform data into some new representation required by the model<br/><br/>
                                            One hot encoding<br/>
                                            - create "buckets" for every category<br/>
                                            - very common in deep learning. categories are represented by individual output "neurons".<br/>
                                            - ex: identify if a specific number exist. bucket for your category if exist will be marked as "1" and all others have a "0"<br/><br/>


                                            Scaling and Normalization<br/>
                                            - needs data to be normally distributed around zero. <br/>
                                            - atleast the feature data to be scaled to comparable values<br/>
                                            - features with larger magnitudes will have more weight than they should. ex: income data. modeling age and income features - income will be much higher than ages<br/>
                                            - "Scikit_learn" - preprocessor module that helps "MinMaxScaler"<br/>
                                            - remember to scale your results back up<br/><br/>


                                            Shuffling - shuffle data by eliminating by products and randomize the data collected to improve the quality<br/><br/>

                                            Orthogonal Sparse Bigram<br/>
                                            - usecase: to aid in text string analysis and is an alternative to the bi-gram transformation (n-gram with window size 2)<br/>
                                            - creates groups of words that always include the first word<br/>
                                            - creates groups of words of size n, returns every pair of words that includes the first word<br/><br/>


                                            Normalization Transformation<br/>
                                            - normalization prevents the variables with large range differences from dominating the ML model<br/>
                                            - normalization transformer normalizes numeric values to have a mean of zero and variance of one<br/>
                                            - usecase: where variables with a larger magnitude should not dominate the model. apply normalization to ensure each field will have a mean of 0 and variance of 1 to remove any significant magnitude<br/><br/>


                                            TF - IDF<br/>
                                            Term frequency and Inverse Document Frequency - used for search algorithms. measures how relevant a term is to a given document  <br/>
                                            how important a word is in a document by giving weights to words that are common and less common in the document<br/><br/>

                                            TF <br/>
                                            - measures how often a word occurs in a document. word that occurs more may be very important<br/><br/>

                                            Document Frequency<br/>
                                            - how often the word occurs in all documents (ex: a word across all common wikipedia document)<br/><br/>

                                            = TF divide by Document Frequency<br/>
                                            
                                            or = TF * Inverse Document Frequency<br/>
                                            
                                            TF = assumes a log of words or a "Bag of words" - collection of words (mostly not the case everytime)<br/>
                                            all of these can contribute for the search. ML can help<br/>
                                            - convert all lower case<br/>
                                            - abbreviations<br/>
                                            - capitalizations<br/>
                                            - misspellings all can contribute<br/><br/>

                                            Bag of Words<br/>
                                            An extension of TF-IDF is to compute relevancy not only for individual words (terms) but also for<br/>
                                            - Unigram<br/>
                                            - Bigram<br/>
                                            - n-grams<br/>

                                            ex: I love aws exams<br/>
                                            unigram - I, love, aws, exams<br/>
                                            bigrams - i love, love aws, aws exams<br/>
                                            trigrams - i love aws, love aws exams<br/><br/>

                                            Sample IDF question<br/>
                                            
                                                                | I | love | aws | exams | tea | I love | love aws | love tea| aws exams  |
                                            -----------------------------------------------------------------------------------------------
                                            i love aws exams     |   |      |     |       |     |        |          |         |            |
                                            i love tea           |   |      |     |       |     |        |          |         |            |
                                            
                                            <br/>

                                            <br/>
                                            using TF-IDF
                                            <br/>
                                            - search algorithm can compute TF-IDF for every word in a corpus
                                            <br/>
                                            - for each given search word, sort the documents by their TF-IDF score for that word - more like accuracy rate
                                            <br/>
                                            - display the results 

                                            <br/><br/>
                                            NOTE: Exam use cases<br/><br/>

                                            1) finding phrases in spam - "N-Gram" - compare whole phrases such as "click now", "you're a winner", "click here now" or "buy now!" etc.<br/>
                                            2) finding subject of several pdfs - "TF-IDF" - filter less important words in the documents<br/>
                                            3) finding subject of several pdfs - "Orthogonal Sparse Bigram" - find common word combinations repeated in the documents<br/>
                                            4) usecase: to do sentiment analysis on famous quotes. some of the pre processing that can be done before building the model <br/>
                                                - remove stop words<br/>
                                                - lowercase each word <br/>
                                                - tokenize each word<br/>
                                                - clean sentences by trim leading and trailing spaces<br/>
                                                - remove punctuation, unnecessary characters and reduce duplicate space into  a single space<br/>
                                                - Tokenization is the process of converting text into tokens before transforming it into vectors. it is also easier to filter out unnecessary tokens<br/>
                                                - Stop words are the most commonly occurring words which are not relevant in the context of the data and do not contribute any deeper meaning to the phrase.<br/><br/>

                                    </div>             
                               </AccordionDetails>
                       </Accordion>    
                       
                       <Accordion >
                           <AccordionSummary>
                               <b>Data (exploratory) Analytics</b>
                              
                           </AccordionSummary>
                           <AccordionDetails>
                               <div>
                               exploratory Data analytics<br/><br/>

                                Python<br/>
                                - Pandas - python library slice and dice data columns<br/>
                                - Dataframe - like a table to manipulate the data<br/>
                                - Series - one dimensional like a row<br/>
                                - Numpy - lower level python library ex: export an array. massage data before exporting to numpy and eventually feed it to ML algorithm<br/><br/>

                                create data from data, handle missing data, work on subset of data etc.,extract random sample of data<br/><br/>

                                Jupyter notebook - code on browser, share, pipeline and sequence of code and rerun repeatedly<br/><br/>

                                ex: matplotlib, numpy, pandas etc<br/><br/>
                                ```<br/>

                                    %matplotlib inline<br/>
                                    import numpy as np<br/>
                                    import pandas as pd<br/><br/>

                                    df = pd.read_csv("PastHires.csv")<br/>
                                    df.head()<br/><br/>
                                    
                                    # export 5 rows into a dataframe<br/>
                                    df[['years Experience', 'Hired']][:5]<br/><br/>
                                    
                                    # groups education ex: Masters - 5, Bachelors -10, Doctors - 20<br/>
                                    dc = df['Level of Education'].value_counts()<br/><br/>
                                    
                                ```	
                                <br/><br/>
                                matplotlib - visualize data distribution and outliers<br/>
                                - whisker plot - mean, middle, outliers visually distributed etc.,<br/>
                                - histogram - range of values continuous values (ex: binning). how many data points lie within a range of 10000 to 120000<br/><br/>

                                Seaborn<br/>
                                    - matplotlib on steroids<br/>
                                    - boxplot - box and whisker plot. more flexibility. several dimensions<br/><br/>

                                    Heat map<br/>
                                    - colors represent of values at each point. ex: miles per gallon, how many values fall within a specific categories, how datapoints appear in different combination<br/><br/>

                                    Pair plot<br/>
                                    - plots of possible combinations at once<br/>
                                    - series of charts<br/><br/>

                                    Joint plot<br/>
                                    - scatter plot with histogram<br/><br/>
                                    

                                Scikit_learn<br/>
                                - python library for learning ML models<br/>
                                - experiment a subset of data<br/>
                                - pre processing - to scale the data into algorithms<br/><br/>
                                - 

                                Random forest - a collection of decision trees (cascade of decision points. ex: a person with 10 years experience with PHD worked in an hospital). <br/>
                                classifier - any number of algorithms that reads the attributes of data (we will know info about data. x years of experience) and labels (what we are predicting. are they hired)<br/>
                                training set and a test set is created<br/>
                                evaluate a set of test set that was not trained on and figure out how well the algorithm is able to predict on the training<br/><br/>

                                    ```<br/>
                                    from sklearn.ensemble import RandomForestClassifier<br/><br/>
                                    
                                    clf = RandomForestClassifier(n_estimators=10)<br/>
                                    clf = clf.fit(X, y)<br/><br/>
                                    
                                    # pass feature and predict the label - 1 if candidate is hired<br/>
                                    print(clf.predict([[10, 1, 4, 0, 0, 0]]))<br/><br/>
                                    
                                    # pass feature and predict the label - 0 if candidate is not hired<br/>
                                    print(clf.predict([[10, 0, 4, 0, 0, 0]]))<br/><br/>
                                    
                                    ```
                                    <br/><br/>
                                    
                                    
                                    
                                Problems predicting<br/><br/>

                                1) Classification - two classes we are predicting ex: hired or not hired. discrete set of classifications<br/>
                                more ubiquitous<br/><br/>

                                2) Regression - specific numerical value. based on existing data for height and weight, predicting someone's height based on their weight<br/><br/>


                                Imputation<br/>
                                - filling in missing values (ex: with the mean or median of the column) rather than dropping a field/value<br/>
                                - an alternative general technique is to drop the columns that have no values. this can be done early in the data preparation/exploration stage<br/>
                                - make sure to NOT introduce any bias on the modeling due to this<br/><br/>

                                Normalization<br/>
                                - a column (ex: age centered around a value ex: 40) make sure a column is centered around mean for that column so that the weight is not uneven on the data/column<br/><br/>

                                    ```<br/>
                                    from sklearn import preprocessing<br/><br/>

                                    scaler = preprocessing.StandardScaler()<br/>
                                    scaled = scaler.fit_transform(my_features);<br/>

                                    ```
                                    <br/>
                                ex: pandas to prepare/explore the data and numpy/scikit-learn to clean up the array/data that is ready for ML model. feed it to tensor flow to keras to predict a neural network<br/><br/>

                                Correlation Coefficient<br/>
                                A correlation coefficient is a numerical measure of some type of correlation, i.e a statistical relationship between two variables.<br/>
                                The variables may be two columns of a given data set of observations, often called a sample, or two components of a multivariate random variable with a known distribution.<br/>

                                - Covariance: is used when you have a Gaussian (bell curve/normal distribution) relationship between the variables<br/>
                                - Pearsons: is used when you have a Gaussian relationship between the variables<br/>
                                - ex: calculating ROI based on past data resulting around 0.35%. Means no notable correlation with confidence based on the resulting coefficient<br/>
                                - Spearmans: is used when you have a NON-gaussian relationship between the variables<br/>
                                - Polychoric: is used to understand the relationship of variables gathered via surveys such as personality tests and surveys that uses rating scales<br/>

                                <br/>
                               </div>
                           
                                   
                               </AccordionDetails>
                       </Accordion>    
                       <Accordion>
                           <AccordionSummary>
                              <b>Modeling</b>
                              
                           </AccordionSummary>
                           <AccordionDetails>
                               <div>
                                    Modeling  <br/>  <br/>
                                    - Most important step in ML after collecting, cleaning and training data  <br/>
                                    - Deep learning - how neural networks work (convolutional and recurrent networks)  <br/>
                                    - how to identify overfitting, depth of the network, how to optimize  <br/>
                                    - sagemaker, built in algorithms. model tuning  <br/>
                                    - comprehend, translate, polly, transcribe, lex  <br/>
                                    - evaluation of results - accuracy, precision,computing  <br/>  <br/>

                                    Training Model Steps  <br/>
                                    1. Gather/Engineer data into your dataset. for training, testing etc.,  <br/>
                                    - Ex: using boto3, s3.create_bucket, urllib.request.urlretrieve, pd.read_csv etc.,  <br/>
                                    2. Randomize the dataset  <br/>
                                    - ex: using "model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))]"  <br/>
                                    3. Split the dataset into train and test datasets  <br/>
                                    - ex: train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))])  <br/>
                                    4. Choose best algorithm  <br/>
                                    - ex: prefix = 'sagemaker/DEMO-xgboost-dm'  <br/>
                                            
                                            containers = {JSON.stringify({'us-east-1':'1234567890.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest' })} <br/> <br/>
                                            
                                    5. Load Container for chosen model <br/>
                                    - ex: sess = sagemaker.Session() <br/>
                                            ```
                                                xgb = sagemaker.estimator.Estimator( containers[my_region], role, train_instance_count=1,  <br/>
                                                                                train_instance_type='ml.m4.xlarge', output_path=my_s3_path,  <br/>
                                                                                sagemaker_session=sess) <br/>
                                                xgb.set_hyperparameters(max_depth=5, eta=0.2, gamma=4, min_child_weight=6, num_round=100) 
                                                ```    
                                                <br/>
                                    6. Manage compute capacity <br/>
                                        - ex:train_instance_type='ml.m4.xlarge', <br/>
                                    7. Create an instance of chosen model <br/>
                                    8. Define the model's hyperparameter values <br/>
                                        - ex: num_round, subsample etc., <br/>
                                    9. Train the model <br/>
                                        - ex  xgb_fit 'train': s3_input_train <br/> <br/>
                                            
                                        
                                        <br/><br/>

                                    Deploying the model <br/>
                                    - creates an endpoint configuration and eventually provide an https endpoint<br/>
                                    - evaluate the performance and accuracy model<br/>
                                    - provide instance count which can receive inference requests<br/>
                                    - ex: xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')<br/>
                                    - to run test data and predict inferences<br/>
                                    - ex: predictions = xgb_predictor.predict(test_data_array).decode('utf-8')<br/><br/>

                                    Models<br/>
                                    - supervised learning<br/>
                                    - target variable (dependent variable) to be predicted from independent variables<br/>
                                    - mostly used - linear regression, logistic regression, random forest, gradient boosted trees, nearest neighbor, support vector machines, neural networks, decision trees, naive bayes<br/>
                                        - classification: categorical target. ex: image classification<br/>
                                        - regression: continuous target. ex: sales trend forecasting<br/>
                                        - Note: knn, kernel svm, decision trees, multilayer perceptron are non linear<br/><br/>
                                    - unsupervised learning<br/>
                                    - no targets or outcome to predict, used for clustering and dimensionality reduction<br/>
                                    - most used: k means clustering, t-distributed stochastic neighbor embedding, principal component analysis (PCA)<br/>
                                        - clustering: discrete. ex: ad campaign targeting. segment the observations into meaningful groups or clusters based on the patterns of observations<br/>
                                        - dimensionality reduction: continuous. ex: feature extraction (PCA). distill the relevant information for the observations while reducing the number of features<br/><br/>
                                    - reinforcement learning<br/>
                                    - train to make specific decisions. train itself continuously using trial and error<br/>
                                    - mostly used - Q-learning, temporal difference (TD), monte-carlo tree search, asynchronous actor-critic agents (a3c)<br/>
                                        - classification: categorical target. ex: personalized recommendation<br/>
                                        - control: reward + or - reward signal. ex: autonomous robots<br/>
                                    - used to update your model as new inference observations are encountered continually. ex: data center cooling unlabeled data for chillers, pumps, cooling units, actual load from systems usage etc.,<br/>
                                    - usecase: automatic vacuum robot determining efficient path across the floor of a room - Simulation based reinforcement learning (where the model learns through trial and error)<br/><br/>


                                    Non Linear Classification<br/><br/>

                                    Decision Trees<br/>
                                    - Scenarios where we need to classify based on binary classifier (and non linear)<br/>
                                    - ex: classify user behavior for fraudulent activity based on two features like age of the account and transaction month<br/>
                                    - provides highest recall <br/>
                                    - for non linear classification, you need a multilayer perceptron or a Kernel SVM<br/>
                                    - decision tree classifiers are structured as a tree<br/>
                                    - how: what color is the cat in the photo. - steps: single color or contains color<br/>
                                    - note this is a classification algorithm and is not a good fit for continuous value prediction problem<br/><br/>

                                    Logistic Regression<br/>
                                    - provides only classification and does not provide numeric values<br/>
                                    - scenarios where we try to classify and estimate a discrete value (on/off, 1/0) based on a set of independent variables<br/>
                                    - usecase: fraud detection model<br/>
                                    - NOTE: lowering the "class probability threshold" makes the model more sensitive. usecase: fraud case detection scenario if not detecting accurately, consider decreasing the class probability threshold. note this comes with lowered precision<br/>
                                    - Logistic regression classification technique (Regression classification) is one of the recommended method for fraud detection and it also generates probability<br/><br/>

                                    Naive Bayes<br/>
                                    - classification algorithm and is not a good fit for continuous value prediction problem<br/>

                                    - Multinomial Naive Bayes: document classification to know "frequency" of given word from your vocabulary on observed text. ex: a word (from your vocabulary) appears in the given post text or not.<br/>
                                    - Bernoulli Naive Bayes: document classification to know if a word from your vocabulary appears on observed text or not. ex: check if an inappropriate word appears in a given post or not<br/>
                                    - Gaussian Naive Bayes: algorithm works on continuous values in your observations not discrete values<br/><br/>

                                    support vector machine - SVM<br/>
                                    - non linear kernel supports non linear model<br/>
                                    - used to build binary classifier based on couple of features to identify user behavior (ex: fraudulent or not) <br/>
                                    - "kernel trick" is used to learn a linear classifier to classify a non linear dataset<br/>
                                    - transforms linearly inseparable data into linearly separable data by projecting it into a higher dimension. a kernel function is applied on each data point and eventually they become linearly separable<br/><br/>

                                    Linear Classifiers<br/>
                                    - SVM, Naive Bayesian, Single Perceptron <br/><br/>

                                    Perceptron<br/>
                                    - aka artificial neuron<br/>
                                    - inspired by way neuron works<br/>
                                    - single perceptron (and also logistic regression) with tanh activation function provide classification of linear data<br/>
                                    - perceptron "fires" if the inputs sum value is above threshold. i.e outputs positive if above threshold otherwise negative<br/>

                                    - multilayer perceptron - combines multiple perceptrons. each layer's outputs feeds into the next layer as input, aka "feed forward" network<br/><br/>


                                    How to choose<br/>
                                    - Optimizing an objective function through interaction with an environment, then use reinforcement learning<br/>
                                    - discrete or continuous<br/>
                                    - target variable or not<br/><br/>


                                    
                                    Data type      | Supervised learning | Unsupervised learning<br/>
                                    *******************************************************************<br/>
                                    Discrete       | Classification      | Clustering<br/>
                                    -------------------------------------------------------------------<br/>
                                    Continuous     | Regression          | Dimensionality reduction<br/>
                                    --------------------------------------------------------------------<br/>
                                    <br/>
                                    Hyperparameters<br/><br/>

                                    - a parameter whose value is set before the learning process<br/>
                                    - two types <br/>
                                    - model hyperparameters: influences the performance of the model<br/>
                                    - algorithm hyperparameters: affect the speed and quality of the learning process<br/>
                                    - Hyperparameter tuning - Automatic model tuning - find the best version of model by many training jobs on the dataset using algorithm ranges <br/>
                                    - using JSON objects. pass values to "HyperParameterTuningJobConfig". name of the job is "CreateHyperParameterTuningJob". specify the ranges to tune, specify the objective metric <br/>
                                    - two approaches<br/>
                                    - random search   - chooses a random combination of values for each training job<br/>
                                    - bayesian search - performs hyperparameter tuning as a regression problem. one algorithm looking for another to evaluate<br/>
                                        - usecase: SageMaker XGBoost algorithm to find truck efficiency to run without empty with AUC (area under the curve) metric as objective. <br/><br/>


                                    Hyperparameters scaling<br/>
                                    - integer and continuous hyperparameter ranges<br/>
                                    - choose the scale for hyperparameter tuning to use to search the values using scaling type<br/>
                                    - Types: <br/>
                                    - Auto: chooses the best scale<br/>
                                    - Linear: searches the value in range using a linear scale<br/>
                                    - logarithmic: searches values in range using logarithmic scale. works only  for greater than zero, use when there is several orders of magnitude<br/>
                                    - reverse logarithmic - searches values in range using reverse log scale. when continuous hyperparameter ranges between 0 and 1.0<br/>
                                    

                                    DEEP LEARNING<br/><br/>
                                    

                                    Biological inspiration - Human brain has neurons in the cerebral cortex connected via axons<br/><br/>

                                    a neuron "fires" - sends a signal to the neurons it is connected to - when its input reaches a threshold, and this in turn triggers other individual neurons<br/>
                                    layers of neurons connected will yield a learning behavior - ex: billions of neurons each with 1000s of connections yield a mind<br/><br/>

                                    Cortical columns<br/>
                                    - neurons are arranged into many "stacks" or  "columns"<br/>
                                    - "mini-columns" of around 100 neurons per mini columns<br/>
                                    - arranged into further "hyper columns"<br/><br/>

                                    Similarly the GPUs for ML work<br/>

                                    Deep learning containers are a set of Docker Images used for training and serving models in TensorFlow, PyTorch and Apache MXNet. <br/><br/>

                                    Deep neural networks<br/>
                                    
                                    - artificial neurons inspired by above neuron <br/>
                                    - a ML model input a feature data, and predict the output labels at the top<br/>
                                    - input is trained with data based on ideal weights<br/>
                                    - job of the neural network is to learn the weights and bias of this<br/><br/>

                                    Deep learning frameworks - Tensorflow/Keras, MXNet<br/>
                                    - programming framework that does this.<br/>
                                    - need parallel processing/compute at scale to do above<br/>
                                    - can be many GPUS, many nodes to compute in parallel. <br/><br/>

                                    Horovod distributed framework - tensorflow code can implement this . parallelize the training to as many machines as needed<br/>
                                    - horovod allows to distribute workload among multiple compute nodes.<br/>
                                    - available for both cpu and gpu aws compute instances. <br/>
                                    - horovod follows MPI (message passing interface) model in all-reduce fashion. <br/>
                                    - this standard for passing messages and managing communication between nodes in high performance distributed computing environments. <br/>
                                    - sagemaker automates the horovod cluster setup and runs the appropriate commands to make sure that training goes smoothly without the need to manage clusters directly<br/><br/>


                                    Types of Neural Networks<br/>
                                    - Feedforward neural networks - basic neural network where we feed feature data and predict output<br/>
                                    - Convolutional Neural Networks (CNN)<br/>
                                    - built to deal 2 dimensional data. ex: find  a stop sign<br/>
                                    - Image classification<br/>
                                    - Recurrent Neural Networks (RNNs)<br/>
                                    - Deals with sequences in time - predict stock prices, understand words in a sentence, translation etc.,<br/>
                                    - LSTM - long short term memory, GRU - gated recurrent unit<br/>
                                    - can be used to perform sentiment analysis<br/><br/>
                                    
                                    
                                    
                                    Activation Functions<br/>
                                    - different types that can determine the performance<br/>
                                    - sums up input of all incoming neurons and returns the output<br/>

                                        - linear activation - <br/>
                                        - more like passthrough. gets input and outputs<br/>
                                        - doesnt do any transformation/do anything<br/>
                                        - cant do backward propagation<br/>
                                        - no interesting learning<br/>
                                        - no point to have multiple layers since it returns the same data<br/>
                                        - NOTE generally not used<br/><br/>

                                        - Binary Step Function<br/>
                                        - whether it belongs to a category -  either on or off<br/>
                                        - can't handle multiple classification - ex in a large data we dont just do whether some logic exist or not. could be more than that<br/>
                                        - vertical slopes dont work with calculus<br/>
                                        - NOTE: simple in concept but very limited. not used <br/><br/>
                                        
                                        - Non linear activation functions<br/>
                                        - complex mapping between inputs and outputs<br/>
                                        - not just limited to a binary or passing data. actual mappings happen based on input data<br/>
                                        - allows backpropagation. they have an useful derivative<br/>
                                        - allows multiple layer - linear functions degenerate to a single layer<br/><br/>
                                        
                                        Sigmoid (Logistic Activation functions) / TanH (Hyperbolic function)<br/>
                                        - Nice and smooth<br/>
                                        - sigmoid - scales everything from 0 to 1<br/>
                                        - tanH - scales everything from -1 to 1<br/>
                                        - TanH preferred over Sigmoid. generally suited for RNNs<br/>
                                        - NOTE: VANISHING GRADIENT changes slowly for high or low values. <br/>
                                        - compute intensive<br/><br/>
                                        
                                        Rectified Linear Unit (ReLU)<br/>
                                        - avoids sigmoid/tanh limitations<br/>
                                        - easy, fast and popular choice.<br/>
                                        - NOTE: Dying ReLU problem: when inputs are zero or negative values degenerate back to linear functions   <br/>
                                            "Leaky ReLU" - solves "dying ReLU" by introducing a negative slope below 0 <br/>
                                            "Parametric ReLU (PReLU)" - slope in the negative part is learned via backpropagation. complicated and YMMV<br/>
                                            "Exponential Linear Unit-ELU" - like leaky. instead of straight line on negative but exponential. calculus favors this than leaky<br/>
                                            "Swish" - from google, performs really well. mostly a benefit with very deep networks (40+ layers)<br/>
                                            "Maxout" - Outputs the max of the inputs. Technically ReLU is a special case of maxout. but doubles the parameters that need to be trained and not practical<br/>
                                        - ReLU has output of zero if the input is less than zero, and raw output otherwise. This is not probability distribution<br/>
                                        - ReLU should only be used within hidden layers of a neural network model<br/><br/>
                                            
                                        Softmax<br/>
                                        - final output layer of a multiple classification problem<br/>
                                        - takes output from last layer and converts into probabilities for each classification that each neuron represents. so can choose the max if a classification is needed to be chosen<br/>
                                        - cant produce more than 1 label. ex: if we want to check multiple things inside a picture. it will not. sigmoid will do<br/><br/>
                                        
                                        Choosing an Activation function<br/>
                                        - Multiple classification - use softmax on the output layer<br/>
                                        - RNNs - use TanH<br/>
                                        - For everything else - ReLU followed by Leaky ReLU, followed by PReLU, Maxout and if not Swish for really deep networks <br/><br/>
                                        

                                    CNNs - Convolutional Neural Networks<br/>
                                    taking source data (image or any source), breaking into chunks (convolutions) and assemble/look for patterns and increasingly higher complexities at higher level in neural network<br/><br/>

                                        - usually for image classification. <br/>
                                        - but if we are trying to find a feature or pattern but we dont know where it is (ex: to find a stop sign, image classification)<br/>
                                        - Images that you want to find features within<br/>
                                        - machine translation<br/>
                                        - sentence classification - find a noun or verb in a sentence which you may not know<br/>
                                        - sentiment analysis - example a happy, sad, frustrated sentiment<br/>
                                        - find features that are not in a specific spot<br/>
                                        - when you have data that doesnt neatly align into columns<br/>
                                        - Example to deal with 2 dimensional data. <br/>
                                        - feature location invariant<br/>
                                        - Softmax - activation function that has output that is a "probability distribution" which means not only output to a [0,1] range but also each output in such a way that the total sum is 1<br/>


                                        - Inspired by biology of visual cortex. ex: how the image seen by the eyes are processed by the brain  <br/>
                                        - "local receptive fields" are groups of neurons that only respond to a part of what we see ("subsampling")<br/>
                                        - "convolutions" - overlap each other to cover entire visual field. break up the data into smaller data and process individually<br/>
                                        - feed into higher layers that identify complex images<br/>
                                        - ex: for complex images, layers for red, green and blue<br/><br/>
                                        
                                        CNNS with Keras/Tensorflow<br/>
                                        - source data must be appropriate dimensions/shape (i.e width x length x color channels)<br/>
                                        black and white/gray scale. can do with single point. for color image - RGB<br/>
                                        - special types of layers like Conv2D etc<br/>
                                        - Conv2D layer type does the actual convolution on a 2D image. Conv1D, Conv3D also available - doesnt have to be image data<br/>
                                        - MaxPooling2D layers - reduce a 2D layer down by taking max value in a given block. just the way of shrinking the data for processing. distill it down to bare essence of data to analyze<br/>
                                        - Flatten layers - convert 2D layer to 1D for passing into a flat hidden layer of neurons<br/>
                                        Typical usage (for image processing)<br/>
                                        - Conv2D -> MaxPooling2D -> Dropout -> Flatten -> Dense -> Dropout -> Softmax<br/><br/>
                                        
                                        - cnns are very compute intensive. heavy cpu/gpu intensive<br/>
                                        - lots of hyperparameters - kernel sizes, many layers with different number of units, amount of pooling, number of layers, choice of optimizer etc.,<br/>
                                        - getting the training data is often the hardest part<br/><br/>
                                        
                                        CNN tuning<br/>
                                        - specialized architectures exist - that define layers, padding and hyperparameters<br/>
                                        - LeNet-5<br/>
                                        - good for handwriting recognition<br/>
                                        - AlexNet<br/>
                                        - image classification deeper than LeNet<br/>
                                        - GoogLeNet<br/>
                                        -  even deeper with performance<br/>
                                        - inception modules - groups of convolution layers<br/>
                                        - ResNet (Residual Network)<br/>
                                        -  even deeper. <br/>
                                        - skip connections - to maintain performance<br/>
                                        - most sophisticated<br/><br/>
                                        - has special connections between the layers of the perceptrons to further accelerate upon things. <br/>
                                        - Ex: ResNet50 comes in image classification in the world of aws within sagemaker etc., <br/><br/>
                                        
                                        - usecase: to classify images that supports multi-label classification. <br/>
                                        it scales to millions of images at high resolution. it solves this problem through convolution and multiple layers in the neural network. <br/>
                                        Note: object detection algorithm can also identify all instances of an object within an image. but will not be able to scale.<br/><br/>
                                    
                                    
                                    Recurrent Neural Networks<br/>
                                    - Deals with sequences in time - predict stock prices, understand words in a sentence, translation etc.,<br/>
                                    - LSTM - long short term memory, GRU - gated recurrent unit<br/><br/>

                                        Time series data <br/>
                                        - sequences of data. sequence in time. processing timeseries data, predict something over time etc.<br/>
                                        - weblogs, sensor logs, stock trades<br/>
                                        - where to drive your self driving car based on past trajectories<br/>
                                        sequence doesnt have to be in time. can be any sequences<br/>
                                        ex: language - sequence of words in a sentence to convey a meaning. RNN can make use of words<br/>
                                        machine generated music - music like text like sequence of musical notes. build a note based on music in the past<br/><br/>

                                        A recurrent neuron <br/>
                                        - inputs are fed back again after the training. mathematically preserve data/output back to the same neuron. so that the data from the previous run helps<br/>
                                        - past behavior influences the future in this way<br/>
                                        - unrolling it in time<br/>
                                        - "A memory cell" - maintains memory of previous output <br/>
                                        - sequence of neurons over time (ex: 3 neuron). <br/>
                                        - each neuron gets input and outputs data using their own activation function individually<br/>
                                        - output of each is fed into the next. ex: 1st neuron output to second (additionally each neuron gets their own input/output in previous line)<br/>
                                        - in above case 1st to 2nd, 2nd to output - time step 1 to time step 2 to time step 3<br/>
                                        - NOTE: more recent output (on the memory) can drive the output behavior this can be a problem in some applications.<br/><br/>

                                        A layer of recurrent neuron<br/>
                                        - scaling the neurons horizontally and feed the output to other<br/>
                                        - build complicated leading to other neurons which can deal with more complex vector/sequence of data<br/><br/>

                                        RNN topologies<br/>
                                        - sequence to sequence - input is time series or some series of data, output can be sequence of data<br/>
                                        - predict stock prices based on series of historical data<br/>
                                        - sequence to vector<br/>
                                        - words in a sentence to sentiment<br/>
                                        - vector to a sequence<br/>
                                        - static image and return captions (produce a sequence)<br/>
                                        - Encoder - Decoder<br/>
                                        - chain encoders and decoders and feed into each other<br/>
                                        - get a sequence of words from one language and build up (embedding layer) and produce a sequence of words in english<br/><br/>

                                        Training RNNs<br/>
                                        - sometimes harder than CNN -  very sensitive to topologies and to choice of hyperparameters<br/>
                                        - very resource intensive. wrong choice can lead to a rnn that doesnt converge (or is useless)<br/>
                                        - backpropagation through time - just like backpropagation not just through neural network but also through time can be tricky but applied at each time step<br/>
                                        - all those time steps add up fast - will end up in deep/deeper neural large network<br/>
                                        - sometimes we limit this the number of steps (truncated propagation through time)<br/>
                                        - state for earlier time steps gets diluted over time - newer behavior could override the first one (in memory cell)<br/>
                                        - LSTM - long short term memory cell - maintain separate short term and long term states. use this if you do not want to give preferential data from the recent behavior. LSTM with SELU is not ideal for binary classifier<br/>
                                        - GRU - gated recurrent unit - simplification of LSTM. if you need compromise on performance vs time to train. GRU are very popular<br/><br/>
                                        
                                    
                                    On AWS <br/>
                                    - can use  EC2/EMR<br/>
                                    - EMR supports apache MXNet and GPU instance types<br/>
                                    - generally P3, P2, G3 instance types <br/>
                                    - Deep learning AMIs available with tensorflow or mxnet available<br/>
                                    - Sagemaker is also available<br/>
                                    <br/>
                                    

                                    Tuning Neural Network<br/>

                                    - Learning Rate<br/>
                                    - example of hyperparameter - influences the quality of the model(similar to topology and feature of the model)<br/>
                                    - neural networks are trained by "Gradient descent" (or similar means) - TODO<br/>
                                    - we start at some random point and sample different solutions (weights) seeking to minimize cost functions over many "epochs" <br/>
                                    - learning rate - how far apart the samples are<br/>
                                    - learning rate hyperparameter ranges from 0.0 to 1.0. higher the number (hyperparameter value), the model learns quickly. ex: 0.8 as against 0.1<br/>
                                    - Note: Too high a learning rate means you might overshoot the optimal solution<br/>
                                        - too small a learning rate will take too long to find the optimal solution<br/><br/>

                                    Epoch<br/>
                                    - In most cases, it is not possible to feed all the training data into an algorithm in one pass. This is due to the size of the dataset and memory limitations of the compute instance used for training. <br/>
                                    There is some terminology required to better understand how data is best broken into smaller pieces.  <br/>
                                    - An epoch elapses when an entire dataset is passed forward and backward through the neural network exactly one time. <br/>
                                    - Epoch can be described as one complete cycle through the entire training dataset and indicates the number of passes that the machine learning algorithm has completed during that training.<br/>
                                    - If the entire dataset cannot be passed into the algorithm at once, it must be divided into mini-batches.  <br/>
                                    Batch size is the total number of training samples present in a single mini-batch.  <br/>
                                    An iteration is a single gradient update (update of the model's weights) during training.  <br/>
                                    The number of iterations is equivalent to the number of batches needed to complete one epoch.  <br/><br/>


                                    Batch Size<br/>
                                    - how many training samples are used within each batch (one iteration) of an epoch<br/>
                                    - not always higher the sample size is good<br/>
                                    - sometimes smaller batch size can work their way out of "local minima" more easily<br/>
                                    - batch sizes that are too large can end up stuck in the wrong solution<br/>
                                    - random shuffling at each epoch can make this look like very inconsistent results <br/>
                                    - NOTE: <br/>
                                        - small batch sizes tend to not get stuck in local minima<br/>
                                        - large batch sizes can converge on the wrong solution at random<br/>
                                        - large learning rates can overshoot the correct solution<br/>
                                        - small learning rates increases training time<br/>
                                        - decreasing batch size would help get out of local minima saddles. decreasing learning rate would prevent overshooting the global loss function minimum<br/>
                                        - if the learning rate is too large, it would cause the accuracy to oscillate. if the learning rate is too small it will take very long to get to the bottom.<br/><br/>
                                    
                                    
                                    Regularization techniques<br/>
                                    - training accuracy is higher than validation accuracy. model is overfitting and regularization can help address this.<br/>
                                    - training data - fed for training<br/>
                                    - evaluation data - training set set aside to evaluate the accuracy of the model<br/>
                                    - test data - for fully trained model to validate the finished model<br/>
                                    - Overfitting - models that are good making prediction on training data but not on new data that it has not seen (test/evaluation data)<br/><br/>

                                    What:<br/>
                                    - regularization are used to prevent Overfitting<br/>
                                    - overfitted models have learned patterns in the training data that dont generalize to the real world<br/>
                                    - higher accuracy on training data but lower accuracy on test/evaluation data set<br/>

                                    How:<br/>
                                    - Too many layers or too many neurons <br/>
                                    - sometimes simple model, fewer neurons/layers could help<br/>
                                    - Dropout<br/>
                                    - remove some neurons at random at your training set from network. forcing the model to spread out its learning across network<br/>
                                    - prevent any individual neuron to overfit<br/>
                                    - sometimes it can be counter-intuitive. but used standard in CNNs<br/>
                                    - can be applied for overfitting models<br/>
                                    - Early stopping<br/>
                                    - sometimes after certain epoch further training doesnt really do any good or improve<br/>
                                    - ex: accuracy keeps improving on the training set over time whereas on the validation set it doesnt improve much<br/>
                                    - automatically detect the validation accuracy if it is getting leveled out. so stop at 5 instead of doing 10 epochs<br/><br/>

                                    
                                    Gradients<br/>

                                        What:<br/>
                                        Vanishing Gradient problem - gradient approaching zero (slope approaches zero/bottom of the curve)<br/>
                                        - when the slope of the learning curve approaches zero, things can get stuck<br/>
                                        - we end up working with very small numbers that slow down training or even introduce numerical errors<br/>
                                        - becomes a problem with deeper networks and RNNs as these "vanishing gradient" propagate to deeper layers<br/><br/>

                                        Exploding gradient - Opposite problem of vanishing gradient<br/>
                                        - more vertical at the beginning<br/>

                                        How to fix:<br/>

                                        Multi-level hierarchy - break up levels into their own sub networks trained individually<br/>
                                        LSTM - long short term memory<br/>
                                        Residual Networks like ResNet, Ensemble of shorter networks<br/>
                                        Better choice of activation functions  - ReLU is a good choice to avoid vanishing gradient problems<br/>

                                        Gradient Checking:<br/>
                                        - A debugging technique/diagnostic tool to check<br/>
                                        - numerically check the derivatives computed during training<br/>
                                        - usually happens in the lower level/code. probably may not be writing code<br/><br/>

                                    
                                    L1 and L2 Regularization<br/>

                                    - prevents overfitting in ML in general<br/>
                                    - adding weights. how well they are added<br/>
                                    - L1 is sum of absolute values of weights<br/>
                                    - perform "feature selection" - entire features go to 0<br/>
                                    - computationally inefficient<br/>
                                    - sparse output<br/>
                                    - L2 is sum of square of weights<br/>
                                    - all feature remain considered, just weighted<br/>
                                    - computationally efficient<br/>
                                    - Dense output<br/><br/>
                                    
                                    When to use L1 over L2<br/>
                                    - feature selection can reduce dimensionality. ex: some features dont matter. if only a subset matters then use L1; the resulting sparsity can make up for its computational inefficiency<br/>
                                    - ex: out of 100 features may be only 10 end up with non zero coefficients<br/>
                                    - if you think all of your feature are important L2 is a better choice.  <br/>

                                    
                                    Confusion Matrix<br/>
                                    - sometimes accuracy doesnt tell the whole story<br/>
                                    - True positive, true negative, false positive, false negatives are important<br/>
                                    - ideally should have TP or TN as big numbers and have FN and FP to be less<br/>
                                    - example - "image has a cat in it"<br/>
                                                    Actual YES           Actual NO <br/>
                                    ----------------------------------------------------<br/>
                                    Predicted YES    TRUE  POSITIVES      FALSE POSITIVE<br/>
                                    Predicted NO     FALSE NEGATIVE       TRUE NEGATIVE<br/>


                                    <br/><br/>
                                                        Actual CAT    Actual NOT CAT   TOTAL<br/>
                                    -------------------------------------------------------------<br/>
                                    Predicted CAT         50      		5                55   <br/>
                                    Predicted NOT CAT     10      		100              110   <br/>
                                    -----------------------------------------------------------<br/>
                                                        60            105<br/>
                                    -------------------------------------------------------------<br/>
                                                        
                                    <br/>

                                    Measuring your Models	<br/>

                                    RECALL = True positive / (True positives + False Negatives)<br/>
                                        - Also known as Sensitivity, true positive rate, completeness<br/>
                                        - percent of positives rightly predicted<br/>
                                        - ex: fraud detection<br/>
                                        - good choice of metric when you care a lot about false negatives<br/>


                                        <br/>
                                                            Actual Fraud    Actual Not Fraud<br/>
                                        -------------------------------------------------------------<br/>
                                        Predicted Fraud         5      		    20                <br/>
                                        Predicted NOT Fraud     10      		100              <br/><br/>

                                        Recall = TP/(TP+FN) = 5/(5+10) = 33% <br/><br/>


                                    PRECISION = True positives / (True positives+false positives)<br/>

                                        - Also known as "Correct Positives"<br/>
                                        - percent of relevant results<br/>
                                        - good choice of metric when you care a lot about false positives<br/>
                                        - ex: medical screening, drug scanning - do not say when someone is on drug/cocaine when they are not<br/><br/>

                                                                Actual Fraud    Actual Not Fraud<br/>
                                            -------------------------------------------------------------<br/>
                                            Predicted Fraud         5      		    20                <br/>
                                            Predicted NOT Fraud     10      		100     <br/><br/>
                                            
                                            Precision = TP/(TP+FP) = 5/(5+20) = 20% <br/>
                                            <br/>
                                        
                                    Other metrics<br/><br/> 

                                        SPECIFICITY = True negatives / (True negatives + False positives)<br/>
                                        - True negative rate<br/><br/>

                                        F1 SCORE = 2TP/(2TP + FP + FN)<br/>
                                        also <br/>
                                        F1 SCORE = 2 times (PRECISION times RECALL )/(PRECISION+RECALL)<br/>
                                        - mathematically harmonic mean of precision and sensitivity<br/>
                                        - when we care both precision and recall<br/>
                                        - f1 score is generally short cut in real world expectation is for a precision or sensitivity or recall<br/>
                                        - usecase: if dataset is highly imbalanced (fraud vs non-fraud dataset where only very small % of dataset was labeled as fraudulent)<br/>
                                        resample the dataset(oversampling/undersampling), use F1 score as objective metric and apply XGBoost<br/><br/>

                                        RMSE - root mean square errors<br/>
                                        - common metric for regression evaluation<br/>
                                        - rmse is a distance between the predicted numeric target and the actual numeric answer (ground truth). <br/>
                                        - smaller the value of rmse, the better is the predictive accuracy of the model. <br/>
                                        - a model with perfect predictions will have RMSE of zero<br/>
                                        - add up all the squared errors from the prediction<br/>
                                        - accuracy measurement. as alpha increases the model becomes more conservative <br/>
                                        - only cares about right and wrong answers<br/>
                                        - usecase: to check if a model is more frequently overestimating or underestimating a target value on a regression model<br/>
                                        - NOTE: regression problem where we are solving for a continuous variable (NOT binary classification)<br/><br/>


                                        ROC Curve - receiver operating characteristic curve<br/>
                                        - plot of true positive rate (recall) vs false positive rate at various threshold settings<br/>
                                        - points above diagonal represent good classification (better than random)<br/>
                                        - ideal curve would be a point in the upper left corner<br/>
                                        - more its "bent" towards upper left the better.<br/>
                                        - NOTE: evaluation technique where you are solving for a binary variable. ex: if a person will buy or not buy based on multiple attributes of sale, property and customer profile etc.,<br/><br/>

                                        AUC - Area under the curve (ROC curve)<br/>
                                        - probability that a classifier will rank a randomly chosen positive instance higher than a randomly chosen negative one<br/>
                                        - ROC AUC is 0.5 is a useless classifier, 1.0 is perfect<br/>
                                        - commonly used for comparing classifiers<br/>
                                        - NOTE: AUC/ROC are used for classification type problems and not for regression algorithm. MAE is used for regression evaluation<br/><br/>
                                        
                                        MAE - Mean Absolute Error<br/>
                                        - regression evaluation metric<br/>
                                        - use when outliers can significantly influence your dataset. <br/>
                                        - usecase: housing data prices that are clustered by region across the dataset. when dataset contains several outliers per region<br/>
                                        - NOTE: AUC/ROC are used for classification type problems and not for regression algorithm. MAE is used for regression evaluation<br/><br/>
                                    

                                    Ensemble learning - bagging and boosting<br/><br/>

                                    ensemble method<br/>
                                    - random forest (forest is set of trees - decision trees). avoids overfitting better than individual decision trees<br/>
                                    - takes multiple models (variations of different model) but finalizes on one model<br/>
                                    - each decision tree is created differently and they vote on the result/model; finally pick one<br/><br/>

                                    Bagging - would generate multiple training sets/models <br/>
                                    can run trainings in parallel. ex: classification model1,2, 3 and finally choose one<br/>

                                    Boosting - serial manner. start with equal assign weights to dataset points. over time refine the weights of the model observations<br/>

                                    how to choose one over other <br/>
                                    - Boosting is hot these days. for accuracy use boosting. XGBoost has higher accuracy<br/>
                                    - Bagging avoids overfitting. spreads data and picks up/averages out the model<br/>
                                    bagging is easy to do parallel execution. boosting is more sequential<br/>
                                    <br/>

                                    Refer sagemaker for model tuning options<br/><br/>


                                    Evaluate the model<br/>
                                    - after training and deploying the model, evaluate to determine the perf and accuracy<br/>
                                    - often generate multiple models with different algorithms/hyperparameters and evaluate each<br/>
                                    - two different validation approaches<br/>
                                    - offline testing - use historical data to send requests to the model for inferences<br/>
                                    - online testing with live data; use production variants (like A/B testing)<br/>
                                    - options for offline evaluation<br/>
                                    - holdout set: set aside a subset of data for evaluation after training<br/>
                                    - K-fold validation; split the example dataset into k parts treat each as a holdoutset for k training runs<br/><br/>

                                    
                                    Machine learning Modeling with Sagemaker<br/>

                                    - Generate example data to use in training<br/>
                                    - Train the model to make predictions or inferences<br/>
                                    - 1) Use an algorithm and example data to train the model<br/>
                                    - 2) Evaluate the model for inference accuracy<br/>
                                    - 3) Integrate model into your application to generate inferences in real time and at scale<br/><br/>
                                    
                                    
                                    ex architecture - flow <br/>
                                    1) Training Data in S3 -  Firehose, ground truth, mechanical turk<br/>
                                    2) Code in Ec2 Container registry - Model by writing training code. Code in <br/>
                                    3) Inference Code (on ML Compute instances) - Training produces inferences code<br/>
                                    4) Helper code in S3 - model data  based on model from training code<br/>
                                    5) Endpoint - Inference code along with helper code generating endpoint for inference for client endpoint<br/>
                                    6) Client application - uses the above endpoint in real time<br/><br/>
                                    
                                    Training Algorithm options<br/>
                                    1) Sagemaker built in algorithms<br/>
                                    2) SageMaker debugger<br/>
                                    3) Apache Spark with Sagemaker<br/>
                                    4) Custom deep learning code<br/>
                                    5) Write your own algorithms<br/>
                                    6) AWS Marketplace provided algorithms<br/><br/>

                                    Deployment Options<br/>
                                    1) SageMaker hosting services - Persistent endpoint to get one prediction at a time<br/>
                                    - provides an http endpoint <br/>
                                    - steps: create a model in sagemaker, create an endpoint configuration, create an https endpoint<br/>
                                    2) SageMaker batch transform - get predictions for an entire dataset<br/>
                                    - provides inferences for entire dataset (instead of 1 request at a time)<br/>
                                    - steps: create batch transform, run batch transform job, SageMaker saves result in S3 bucket<br/><br/>

                                    - SageMaker can be used to deploy different versions exposed through a single endpoint to configure to route percentage of traffic to each. <br/>
                                    - endpoints are not exposed through ALB and NLB.<br/><br/>

                                    Evaluate the Model<br/>
                                    - Determine the performance and accuracy<br/>
                                    - generate multiple models with different algorithms/hyperparameters and evaluate each<br/>
                                    - compare the best performing version and choose<br/>
                                    - Different validation approaches<br/>
                                    - Offline testing - use historical data to send requests to the model for inferences<br/>
                                        - Holdout set: set aside a subset of data for evaluation after training. ex: 80/20 - training vs evaluation<br/>
                                        - K-fold : split example dataset into k parts. treat each as a holdout set for k training runs<br/>
                                    - Online testing - with live data, use production variants like A/B testing<br/>
                                        - using live data to stream requests into your model<br/>
                                        - several different version of algorithm and hyperparameters could be hit and evaluate based on that<br/><br/>
                                    
                                    SageMaker Model Tracking<br/>
                                    - organize, track, compare and evaluate your ML experiments<br/>
                                    - can be used to search key model attributes such as hyperparameter values, algorithm used and tags associated with your team's models. <br/>
                                    - SageMaker capability allows you to manage multiple experiments at the scale of upto 1000s of model experiments<br/>
                                    - usecase: experiment with different datasets, algorithms and hyperparameters to find the best combination for the ML problem. <br/>
                                    track several hundred to over a thousand experiments over the course of the modeling effort. manage experiments at scale<br/><br/>

                                    
                                    SageMaker Model Monitor<br/>
                                    - set alerts for deviations in quality such as data drift<br/>
                                    - Steps: <br/>
                                    - capture data (using endpoints to capture data from incoming requests and resulting predictions)<br/>
                                    - create a baseline - from the data used to train the model <br/>
                                    - schedule monitoring jobs - create monitoring schedule specifying data to collect, how to interpret the data<br/>
                                    - interpret results <br/>

            
            
                                    <br/>
                                </div>  
                               </AccordionDetails>
                       </Accordion>    
                       
                       <Accordion >
                           <AccordionSummary>
                               <b>SageMaker</b>
                               <br/>
                           </AccordionSummary>
                           <AccordionDetails>
                               <div>
                               Amazon SageMaker<br/><br/>
                                    - heart of ML<br/>
                                    - intended to manage entire ML workflow<br/>
                                    - prepare fetch data, feature engineering, training/models, deploy the model, observe in production. <br/>
                                    - learn and gather more data. recycle better feature engineering and redo<br/>
                                    - Amazon SageMaker makes it easy to build ML models and helps to quickly connect the training data, to select and optimize the best algorithm & framework for your application.<br/>
                                    - SageMaker includes hosted Jupyter notebooks (to explore & visualize data stored in S3), can connect to data in S3 or use AWS Glue to move data from RDS, DynamoDB or Redshift into S3 for analysis in your notebook<br/><br/>


                                    <img width="50%"   src="https://d1.awsstatic.com/SageMaker/SageMaker%20reInvent%202020/most_comprehensive_box_no_space%402x.5e35d9542b9311059942552d3804241c9621bf77.png"></img>

                                    <br/><br/>



                                    <br/><br/>

                                    sagemaker helps to spin up instances to train, deploy models etc.,<br/><br/>

                                    Sample use case:<br/>
                                    1) s3 data to training data. code from docker image can read the training data/model<br/>
                                    2) save the trained data back to s3. deploy to production. have another ecr/docker image to read the incoming data and save inferences<br/>
                                    3) pull the inference code from ecr and serve and spin up instances/endpoints to the client models. <br/>
                                    4) api/apps can use the endpoints to validate. ex: check if this is the picture of the cat<br/><br/>

                                    SageMaker notebook - notebook instance spun up from console<br/>
                                    - has access to S3. can train, validate, <br/>
                                    - use spark, tensorflow etc.,<br/>
                                    - prebuilt models/docker images<br/>
                                    - can spin up training instances or dedicated instances/host to perform, deploy to whole fleet of instances and predict<br/>
                                    - provide hyperparameters for the model etc,<br/>
                                    - sagemaker console can also be used. notebook can be used for repeatability<br/><br/>


                                    <img    src="https://d33wubrfki0l68.cloudfront.net/c5d13a2fd56b6a3ba6c08e713f517703065e0b89/08959/img/screenshot-786-.png"></img>

                                    <br/><br/>

                                    Data preparations<br/>
                                    - for sagemaker data can be prepared and be set <br/>
                                    - ideal format varies with algorithm - often "RecordIO" or "Protobuf"<br/>
                                    - usually these algorithms also take csv also<br/>
                                    - preprocessing can be done with notebook or Apache Spark to preprocess at scale<br/>
                                    - Scikit_learn, numpy, pandas can all be used from the notebook - to manipulate before feeding<br/><br/>


                                    Create a training job<br/>
                                    - s3 url with data<br/>
                                    - ML compute resources (like G2, P2, P3)<br/>
                                    - s3 output url<br/>
                                    - ECR (docker image) path to training code<br/>
                                    - API CreateTrainingJob: <br/>
                                        - AlgorithmSpecification, HyperParameters, InputDataConfig, OutputDataConfig, <br/>
                                        - ResourceConfig, EnableManagedSpotTraining, RoleArn, Environment, StoppingCondition<br/>
                                        - Note: RoleArn, ResourceConfig (compute instance and storage volumes), OutputDataConfig (S3 where artifacts/model will be stored) are all mandatory <br/><br/>

                                    Training options<br/>
                                    - built in training algorithms<br/>
                                    - spark MLLib<br/>
                                    - custom python tensorflow/MXNet code<br/>
                                    - own/custom docker image<br/>
                                    - algorithm from AWS marketplace<br/><br/>

                                    Deploy<br/>
                                    - save trained model to S3<br/>
                                    - can deploy in two options<br/>
                                    - persistent endpoint for individual predictions on demand<br/>
                                    - SageMaker batch transform to get predictions for entire dataset<br/>

                                    <br/>

                                    <img   src="https://docs.aws.amazon.com/sagemaker/latest/dg/images/batch-transform-v2.png"></img>


                                    <br/><br/>
                                    Other options:<br/>
                                    - inference pipelines for more complex processing<br/>
                                    - sagemaker neo for deploying to edge devices<br/>
                                    - elastic inference for accelerating deep learning models<br/>
                                    - automatic scaling - increase # of endpoints as needed<br/><br/>

                                    Security:<br/>
                                    - seamless integration IAM role/policies.<br/>
                                    - Artifacts are encrypted while in transit (SSL) and at rest<br/>
                                    - KMS integration across S3, Notebooks, training jobs, endpoints<br/><br/>


                                    IMPORTANT - Refer - SageMaker Algorithms section<br/><br/>

                                    Model tuning in sagemaker:<br/>

                                    - hyperparameter tuning<br/>
                                    - what are the best values for learning_rate, batch_size, depth etc.,<br/>
                                    - often you have to experiment<br/>
                                    - no one definite/prescribed way. since its complex and based on case-by-case, need to try every combination of every possible value, train and evaluate (but can become expensive)<br/><br/>

                                    Automatic model tuning<br/>
                                    - define the hyperparameter you care the most, the ranges, metrics <br/>
                                    - sagemaker spins up "HyperParameter tuning job" that trains many combinations - training instances are spun up as needed (potentially a lot of them)<br/>
                                    - It learns as it goes - set of hyperparameters producing the best results can be deployed as a  model<br/><br/>

                                    Best practices<br/>
                                        - dont optimize too many hyperparameters at once (try to focus on whats important)<br/>
                                        - limit  your ranges to as small as range as possible<br/>
                                        - use logarithmic scales when appropriate<br/>
                                        - dont run too many training jobs concurrently- this limits how well the process can learn as it goes<br/>
                                        - while running training jobs on multiple instances, make sure to report the correct objective metric in the end<br/><br/>


                                    SageMaker and Apache Spark<br/><br/>

                                        Spark - preprocessing data as normal with spark and inference in SageMaker (makes use of the best from both the worlds)<br/>
                                        - can generate dataframes. distribute the processing across cluster on spark<br/>
                                        - use sagemaker-spark library<br/>
                                        - SageMakerEstimator (similar to MLLib, after spark map reduce/data frame loading use this for ML) - KMeans, PCA, XGBoost<br/>
                                        - SageMakerModel<br/><br/>

                                        - connect notebook to a remote EMR cluster running spark (or Zeppelin)<br/>
                                        - dataframe - should have a "Features" columns that is a vector of doubles. an optional labels column of doubles<br/>
                                        - call fit on your SageMakerEstimator to get a SageMakerModel<br/>
                                        - call transform on the SageMakerModel to make inferences<br/>
                                        - works with Spark pipelines as well<br/><br/>


                                        ```<br/>
                                        val estimator = new KMeansSageMakerEstimator(sagemakerRole = IAMRole(roleArn),<br/>
                                                                    trainingInstanceType = "ml.p2.xlarge", trainingInstanceCount = 1,<br/>
                                                                    endpointInstanceType = "ml.c4.xlarge", endpointInitialInstanceCount = 1)<br/>
                                                        .setK(10)<br/><br/>
                                                        .setFeatureDim(784)<br/><br/>
                                        # train				  <br/>
                                        val model = estimator.fit(trainingData)<br/>
                                        val transformedData = model.transform(testData)<br/>
                                        transformedData.show<br/><br/>

                                        ```<br/><br/>


                                    Newer features of SageMaker<br/><br/>

                                    - SageMaker Studio - visual IDE for ML, integrates many of the features<br/>
                                    - create and share Jupyter notebooks with sagemaker studio<br/>
                                    - switch between hardware configurations (no infrastructure to manage)<br/><br/>

                                    SageMaker Experiments <br/>
                                    - organize, capture, compare and search your ML jobs<br/>
                                    - more useful way of visualizing<br/><br/>

                                    SageMaker Debugger<br/>
                                        - saves internal model state at periodic intervals. <br/>
                                        - will save the individual gradients as the models are saved at periodic times<br/>
                                        - define rules for detecting unwanted conditions while training<br/>
                                        - gradients/tensors over time as a model is trained<br/>
                                        - ex: connect cloudwatch/alarm/trigger based on above rules<br/>
                                        - a debug job is run for each rule you configure<br/>
                                        - sagemaker studio debugger dashboard<br/>
                                        - auto generated training reports<br/>
                                        - build in rules - monitor system bottlenecks, profile model framework operations (like tensorflow under the hood), debug model parameters<br/><br/>

                                        - SageMaker Debugger insights Dashboard<br/>
                                        - Debugger ProfilerRule - profilerReport, hardware system metrics (cpu/gpu memory increase etc), framework metrics (max initialization time, overall framework metrics, stepoutlier etc)<br/>
                                        - Built in actions to receive notifications or stop training - StopTraining(), Email() or SMS(). email and SMS notifications are sent using SNS<br/>
                                        - Profile system resource usage and training<br/><br/>

                                        Supported frameworks - tensorflow, pytorch, mxnet, xgboost, sagemaker generic estimator<br/>
                                        debugger apis available in github. can construct hooks and rules for "CreateTrainingJob", "DescribeTrainingJob" apis.<br/>
                                        SMDebug client library that can register hooks for accessing training data<br/><br/>


                                    SageMaker AutoPilot<br/>
                                        - automates algorithm selection, data preprocessing, model tuning and all infrastructure<br/>
                                        - does all the trial and error for you<br/>
                                        - more broadly called as "AutoML"<br/>
                                        - can add human guidance, option with or without code in SageMaker Studio or AWS SDKs<br/>
                                        - integrates with SageMaker Clarify (see below)<br/><br/>

                                        - load s3 data for training<br/>
                                        - select your target column for prediction<br/>
                                        - automatic model creation<br/>
                                        - model notebook is available for visibility and control<br/>
                                        - model leaderboard - ranked list of recommended models, you can pick one<br/>
                                        - deploy and monitor the model, refine via notebook if needed<br/><br/>


                                        NOTE: <br/><br/>
                                        Limited to:<br/>

                                            Problem types:<br/>
                                            - binary classification<br/>
                                            - multiclass classification<br/>
                                            - regressions<br/><br/>

                                            algorithm types:<br/>
                                            - linear learner<br/>
                                            - XGboost<br/>
                                            - Deep learning (MLPs)<br/><br/>

                                            Data must be tabular csv<br/><br/>


                                    SageMaker Clarify<br/>
                                        - brings transparency on how models arrive at their predictions and what bias might exist in those predictions (ex: above using AutoML)<br/>
                                        - Feature attribution<br/>
                                        - uses SHAP baselines/shapley values<br/>
                                        - research from cooperative game theory<br/>
                                        - assigns each feature an importance value for a given prediction<br/><br/>

                                    SageMaker Model Monitor<br/>
                                    - get alerts on quality deviations on our deployed models (via cloudwatch)<br/>
                                    - visualize data drift (ex: loan model starts giving people more credit due to drifting or missing input features)<br/>
                                    - detect anomalies and outliers<br/>
                                    - detect new features<br/>
                                    - no code needed<br/><br/>

                                    - data is stored in s3<br/>
                                    - monitoring jobs are scheduled via monitoring schedule<br/>
                                    - metrics are emitted via cloudwatch (cw notifications)<br/>
                                    - integrates with tensorboard, quicksight, tableau or just visualize in SageMaker studio<br/><br/>

                                    Monitoring types:<br/>
                                    - drift in data quality - relative to base line<br/>
                                    - drift in model quality - accuracy. can integrate with Ground Truth labels (ex: what humans are labeling vs the model)<br/>
                                    - Bias drift - if we are seeing biases on the feature data<br/>
                                    - feature attribution drift - using normalized discounted cumulative gain (NDCG) score, compares feature ranking of training vs live data<br/><br/>


                                    SageMaker model monitor + clarify<br/>
                                    - clarify detects potential bias. ex: imbalances across different groups/ages/income brackets<br/>
                                    - with model monitor you can monitor bias and be alerted to new potential bias via cloudwatch<br/>
                                    - sagemaker clarify also helps explain model behavior. understand what feature contributes bias<br/><br/>


                                    2021 features (may or may not be in exams)<br/><br/>

                                    SageMaker Jumpstart<br/>
                                    - one click models and algorithms from model zoos<br/>
                                    - over 150 open source models in NLP, object detections, image classifications etc.<br/><br/>

                                    SageMaker Data Wrangler<br/>
                                    - preprocess your data within sagemaker<br/>
                                    - import, transform, analyze, export data within sagemaker studio<br/><br/>

                                    SageMaker feature store<br/>
                                    - find, discover and share features in Studio<br/>
                                    - online (low latency) or offline (for training or batch inference) modes<br/>
                                    - features organized into feature groups<br/><br/>

                                    SageMaker edge Manager<br/>
                                    - software agent for edge devices<br/>
                                    - model optimized with sagemaker neo<br/>
                                    - collects and samples data for monitoring, labeling and retraining<br/><br/>

                                    Security:<br/>
                                        - IAM - setup user accounts with least privilege<br/>
                                        - use MFA<br/>
                                        - use SSL/TLS <br/>
                                        - use cloudtrail to log api and user activity. cloudwatch for monitoring/events for alarms<br/>
                                        - use encryption<br/>
                                        - be careful with PII<br/><br/>

                                        At rest<br/>
                                            KMS - accepted by notebooks, all sagemaker jobs<br/>
                                            - everything under /opt/ml and /tmp can be encrypted with a kms key<br/>
                                            - training, tuning, batch transform, endpoints<br/><br/>

                                            S3 - can use encrypted s3 buckets for training data and hosting models. s3 can also use kms<br/><br/>

                                        In transit<br/>
                                        - SSL/TLS<br/>
                                        - IAM roles<br/>
                                        - inter node training communication may be optionally encrypted - can increase time and cost with deep learning<br/>
                                        - enabled via console or api when setting up a training or a tuning job<br/>
                                        - inter-container traffic encryption<br/><br/>


                                        IAM specific permissions<br/>
                                            User permissions:<br/>
                                            - CreateTrainingJob<br/>
                                            - CreateModel<br/>
                                            - CreateEndpointConfig<br/>
                                            - CreateTransformJob<br/>
                                            - CreateHyperParameterTuningJob<br/>
                                            - CreateNotebookInstance<br/>
                                            - UpdateNotebookInstance<br/><br/>

                                            Predefined policies:<br/>
                                            - AmazonSageMakerReadOnly<br/>
                                            - AmazonSageMakerFullAccess<br/>
                                            - AdministratorAccess<br/>
                                            - DataScientist<br/><br/>

                                    Elastic Inference (EI) accelerators<br/>
                                    - Accelerates deep learning inference<br/>
                                    - At fraction of cost of using a GPU instance for inference model<br/>
                                    - EI accelerators may be added alongside a CPU instance (ex: ml.eia1.medium/large/xlarge)<br/>
                                    - EI accelerators also may be applied to notebooks<br/>
                                    - works with tensorflow and MXNet pre-built containers. ONNX may be used to export models to MXNet<br/>
                                    - works with custom containers built with EI-enabled tensorflow or MXNet<br/>
                                    - works with image classification and object detection built-in algorithms<br/><br/>
                                    - Prerequisites:<br/>
                                    - make sure to provision aws privatelink vpc endpoints for subnets<br/>
                                    - provision instance role with policy that allows users accessing the instance to connect to accelerators<br/><br/>


                                    SageMaker Processing<br/>
                                    - Managed AWS service that you can use to run data engineering workloads in SageMaker using processing APIs<br/>
                                    - SageMaker processing manages your SageMaker environment for you in a processing container<br/>
                                    - Managed service removes much of the infrastructure and coding work needed to perform data engineering tasks<br/><br/>


                                    SageMaker Ground Truth<br/>
                                    - automated data labeling using machine learning<br/>
                                    - labelers through Amazon Mechanical Turk<br/>
                                    - data labeled by either human labelers or by ML algorithms<br/>
                                    - usecase to label user's photos, videos. note: there can be user error/mislabel of images and/or videos. <br/>
                                    - to mitigate use "annotation interface" best practices. <br/>
                                    - other feature is "annotation consolidation" <br/>
                                    - send each data object to multiple workers and then consolidate the responses into a single label. it then takes their annotations and compares them using annotation consolidation algorithm. the algorithm first detects outlier annotations that are disregarded. <br/>
                                    - bounding box, semantic segmentation are some of the functions that can be used to ensure the accuracy of labeling tasks<br/><br/>


                                    <img width="75%"  src="https://d1.awsstatic.com/SageMaker/Ground%20Truth/product-page-diagram_SageMaker-Groud-Truth_HIW_DARK%402x.78e7dc2b2c74ed4acb4ee0226e9655ae5d404f7b.png"></img>

                                    <br/><br/>

                                    <br/>


                                    <br/>
                               </div>
                           
                                   
                               </AccordionDetails>
                       </Accordion>    
                       <Accordion>
                           <AccordionSummary>
                              <b>SageMaker Algorithms</b>
                              
                           </AccordionSummary>
                           <AccordionDetails>
                               <div>
                                    Algorithms in SageMaker<br/><br/>

                                    - Categorized based on Data<br/>
                                        - Structure<br/>
                                        - linear learner<br/>
                                        - factorization machines<br/>
                                        - xgboost<br/>
                                        - k-means <br/>
                                        - random cut forest<br/>
                                        - Image Data<br/>
                                        - Image Classification<br/>
                                        - Natural language data<br/>
                                        - Sequence2Sequence<br/>
                                        - Neural Topic Modeling<br/>
                                        - Latent Dirichlet Allocation<br/>
                                        - Blazing text<br/>
                                        - Time series data<br/>
                                        - Deep AR<br/><br/>
                                        
                                    - Types<br/>
                                    - Regression Algorithms<br/><br/>
                                    - Clustering Algorithms<br/>
                                        - unsupervised learning algorithm<br/>
                                        - discrete groups within data. group as similar as possible to one another and as different as possible from members of other group<br/>
                                        - define properties to group<br/>
                                        - use cases: find delivery source location (rc: packages), identifying crime centers, customer segmentation, <br/>
                                        - use cases: fraud, detection based on patterns, cyber profiling criminals, clustering of IT alerts, call center recording analysis<br/>
                                        - Sagemaker - K-Means<br/><br/>
                                    - Classification algorithms<br/>
                                        - supervised learning algorithms<br/>
                                        - learns from training data then uses the result to classify new observations<br/>
                                        - two types: binary class or multi class (ex: different breeds of cats, shapes (triangles, squares))<br/>
                                        - use case: voter prediction, customer loan default, object detection, image classification, fraud detection, customer segmentation, product classification<br/><br/>
                                        - sagemaker algorithms: <br/>
                                        - linear learner (can be for regression or classification)<br/><br/>
                                        - blazing text - implements Word2vec and text classification algorithms<br/><br/>
                                        - XGBoost - can do both regression and classification. <br/>
                                            - implementation of gradient boosted trees algorithm<br/>
                                            - supervised learning algorithm for predicting a target combining the estimates from a set of simpler models<br/>
                                            - requires a data matrix of observations across dimension of features<br/>
                                            - also requires a target column across the observations<br/>
                                            - can differentiate the importance of features through weights<br/>
                                            - use cases: predict credit card default on payments<br/><br/>
                                        - K nearest neighbors<br/>
                                            - finds the k closest points to the sample point and gives a prediction of the average of their feature<br/>
                                            - index based<br/>
                                            - objective: build  k-NN index to allow for efficient determination of the distance between points<br/>
                                            - train to construct the index<br/>
                                            - note: when using knn, generally create an index and use PCA to reduce the dimensions - curse of dimensionality<br/>
                                            - use cases: predict wilderness tree types from geological and forest service data<br/><br/>
                                        - Factorization Machines<br/><br/>
                                        - both regression and classification<br/>
                                        - extension of linear model used on high dimensional sparse datasets<br/>
                                        - typically used for sparse datasets such as click prediction and item recommendations<br/>
                                        - scored using binary cross entropy (log loss), accuracy (at threshold=0.5) and f1 score (at threshold=0.5)<br/>
                                        - use case: item recommendation on website (ex: based on data capture/patterns etc)<br/><br/>
                                        - Image classification<br/>
                                            - supervised algorithm, multi class classification<br/>
                                            - takes an image as an input and outputs one or more labels assigned to that image<br/>
                                            - uses convolutional neural network (ResNet) that can be trained from scratch or trained using transfer learning when a large number of training images are not available <br/>
                                            - RecordIo, jpg/png<br/>
                                            - usecases: identify offensive images/twitter feed posts etc<br/><br/>
                                            - Transfer learning <br/>
                                                - scenario where existing trained model does not exist (ex: proprietary auto part), start with pretrained "off the shelf" models. <br/>
                                                - ex: ImageNet[2] has more than 11 million images with about 11,000 categories <br/>
                                                - in this case start with pretrained and generalize with datasets by simple re-adjustment or fine tuning<br/>
                                                - a network is initialized with weights (ex: above from imagenet) which can be fine tuned for image classification<br/>
                                            - in SageMaker, this is run in two modes "full training" or "transfer learning" mode<br/>
                                                - in full training mode, the network is initialized with random weight and trained on user data from scratch<br/>
                                                - in transfer learning mode, the network is initialized with weights. training can be achieved with smaller dataset (since the network is already trained)<br/><br/>
                                            - Other models similar to transfer learning<br/>
                                                - online learning - train model incrementally in batches or as individual observations. <br/>
                                                - incremental learning - start with existing model, extending with new data. requires some existing data<br/>
                                                - out-of-core learning - to train huge datasets that cannot be loaded in server's memory. algorithm loads the data & trains on the subset, loads another subset of data and trains etc.,<br/><br/>
                                        - Random cut forest<br/>
                                            - unsupervised algorithm for detecting anomalous data<br/>
                                            - uses an anomaly score<br/>
                                            - low score indicates data point is considered normal. high score indicates presence of anomaly in the data<br/>
                                            - definition of low and high depend on the application. common practice: scores beyond 3 standard deviations from mean score are considered anomalous<br/>
                                            - does not require a target (label) column across the observations, since it is unsupervised<br/>
                                            - use cases: find exceptions in streaming trade data<br/>
                                            - Note: Unsupervised Anomaly Detection<br/>
                                                - Do not need training data<br/>
                                                - data groups that appear frequently are assumed as normal traffic. infrequent instances (considerably different from majority) are regarded as malicious<br/>
                                                - Assumption: 1 Most of the network connections are normal traffic and very small percentage is abnormal<br/>
                                                - Assumption: 2 Malicious traffic is statistically different from normal traffic <br/><br/>
                                    - Image analysis algorithms<br/><br/>
                                    - Text analysis algorithms<br/><br/>
                                    - Anomaly detection algorithms<br/><br/>
                                    - Reinforcement algorithms<br/><br/>
                                    - Forecasting algorithms<br/><br/>

                                    Concepts<br/>

                                    - Generalization - to generate a model<br/>
                                    - discrete recommendation or categorical algorithms<br/><br/>


                                    Linear Learner<br/>
                                        - linear regression - idea of fitting a line into a training dataset. predictions based on that line<br/>
                                        - have both regression (numerical) and classification (binary or multi class classification)<br/>
                                        - input set of high dimensional vectors including a numeric target or label<br/>
                                            - target is 0 or 1 for binary classification<br/>
                                            - learns a linear threshold function and maps a vector to an approximation of the target<br/>
                                            - optimizes - discrete suited for classification such as f1 measure, precision, recall or accuracy<br/>
                                            - require a data matrix of observations across dimensions of features, also requires a target column across observations<br/>
                                            - use case - predict event outcome: win or lose	  <br/>
                                        - input format - RecordIO wrapped protobuf. Float 32 data only. <br/>
                                            can also take CSV (first column assumed to be label). for unsupervised learning the label_size is set to zero. ex: Metadata Content-Type is specified as text/csv; label_size=0<br/>
                                            File mode - all training data in single mode<br/>
                                            Pipe mode - pipe it/stream it from S3. more efficient with larger data<br/>
                                        - Preprocessing - training data should be normalized (either upfront or tell the linear learner to do it). <br/>
                                            input data should be shuffled<br/>
                                        - Training - uses SGD - stochastic gradient descent - <br/>
                                            choose an optimization algorithm - Adam, AdaGrad, SGD, etc.,<br/>
                                            multiple models are optimized in parallel and chooses the most optimal<br/>
                                            tune L1, L2 regularization to prevent overfitting<br/>
                                        - Validation - most optimal model is selected<br/><br/>
                                        - HyperParameters<br/>
                                            - Balance_multiclass_weights - gives each class equal importance in loss functions<br/>
                                            - learning_rate, mini_batch_size<br/>
                                            - L1 - regularization<br/>
                                            - Wd - weight decay (L2 regularization) <br/>
                                            - important: feature_dim, predictor_type, loss<br/>
                                        - Instance types<br/>
                                            - single or multi machine (GPU or CPU)<br/>
                                            - NOTE: Multi-GPU does not help<br/><br/>
                                        
                                        - usecase: to determine or predict a numerical value (supervised learning/with target)<br/>
                                            ex: linear learning to identify hand writing<br/>
                                            ex: for discrete classification problem statement (find boolean/binary) set predictor_type to binary_classifier. usecase: should this customer receive a direct mail campaign. other valid values are multiclass_classifier or regressor<br/><br/>

                                    Factorization Machines<br/>
                                    - used for classification and regression problems, not deep learning predictions<br/>
                                    - Extension of linear model used on high dimensional sparse datasets<br/>
                                    - typically used for sparse datasets such as "click prediction" and "item recommendation"<br/>
                                    - continuous object: Root mean square error<br/>
                                    - example use case: analyze the images of handwritten digits<br/>
                                    - hyperparameters<br/>
                                    - important: feature_dim, num_factors, predictor_type<br/><br/>



                                    XGBoost	<br/>
                                        boosting - serial/sequential manner. start with equal assign weights to dataset points. over time refine the weights of the model observations<br/>

                                        - extreme gradient boosting<br/>
                                        - gradient boosting is a supervised learning algorithm that attempts to accurately predict a target variable by combining an ensemble of estimates from a set of simpler, weaker models.<br/>
                                        - boosted group of decision trees. new trees made to correct the errors of previous trees<br/>
                                        - uses gradient descent to minimize loss on new trees<br/>
                                        - lot of kaggle competition<br/>
                                        - very fast<br/>
                                        - can be used for classification<br/>
                                        - also used for regression - regression trees	<br/>
                                        - open source <br/>

                                        - models are serialized and deserialized with "Pickle"<br/>
                                        - can use as a framework within notebooks - "Sagemaker xgboost" or as a built in sagemaker algorithm<br/><br/>

                                        - input format - initially CSV or libsvm. Now RecordIO-protobuf and Parquet as well<br/><br/>
                                        - HyperParameters<br/>
                                        - subsample - to prevent overfitting<br/>
                                        - Eta - step size shrinkage prevents overfitting<br/>
                                        - Gamma - minimum loss reduction to create a partition. larger = more conservative. Optional parameter<br/>
                                        - alpha - L1 regularization term. to adjust l1 regularization term on weights. larger = more conservative. Optional parameter<br/>
                                        - base_score - to set the initial prediction score of all instances. Optional parameter<br/>
                                        - lambda - L2 regularization term. larger = more conservative<br/>
                                        - num_class - to set the number of classes. mandatory if the objective is multi:softmax or multi:softprob<br/>
                                        - important: num_round, objective (ex: reg:logistic, reg:squarederror). <br/>
                                        - Instance types<br/>
                                        - CPUs only for multiple instance training <br/>
                                        - is memory bound NOT compute bound<br/>
                                        - M5 is a good choice<br/>
                                        - XGBoost 1.2 - single instance GPU training is available<br/>
                                        - P3 instance type - set "tree_method" hyperparameter to gpu_hist<br/>
                                        - trains more quickly and can be more cost effective<br/>
                                        - if training needs to be done in single instance GPU this is the best choice<br/><br/>
                                        
                                        - use cases: for discrete classification problem statement (ex: boolean/binary) set the objective hyperparameter to "reg:logistic"; use "reg:linear" for answers that are quantitative in nature<br/><br/>
                                        
                                    Seq2Seq - sequence to sequence <br/>
                                        - takes sequence of tokens as inputs and outputs<br/>
                                        - ex: sequence of words translations<br/>
                                        - text summarization that corresponds to words in document<br/>
                                        - speech to text, tokenized audio to words<br/>
                                        - under the hood RNNs and CNNs with attention<br/>

                                        - sometimes training can take up to days<br/>
                                        - pretrained models are available. public training data sets are available<br/>

                                        - input format - RecordIO-Protobuf<br/>
                                        - tokens must be integers (unusual, since most of the algorithms want floating point data)<br/>
                                        - start with tokenized text files (map every word to a number) - both vocabulary file and tokenized<br/>
                                        - samples are available<br/>
                                        - more like TF/IDF<br/><br/>
                                        
                                        - hyperparameter<br/>
                                        - batch_size<br/>
                                        - optimizer_type - adam, sgd, rmsprop<br/>
                                        - learning_rate <br/>
                                        - num_layers_encoder, num_layers_decoder<br/>
                                        - can optimize on <br/>
                                            - accuracy vs provided validation dataset<br/>
                                            - BLEU score - compares against multiple reference translations	<br/>
                                            - Perplexity - cross entropy<br/><br/>
                                            
                                        - Instance Types<br/>
                                        - heavy duty algorithms. can only use GPU instance types - P3 for examples<br/>
                                        - only single machine for training. but can use multiple GPUs on one machine<br/><br/>
                                        

                                    DeepAR - forecasting one dimensional time series data<br/>
                                        - supervised learning algorithm<br/>
                                        - Uses RNNs; <br/>
                                        - unlike the RNN approach, classical forecasting methods like ARIMA (auto regressive integrated moving average) or ETS (exponential smoothing) fit a single model to each individual time series and use that model to extrapolate the time series into the future<br/>
                                        - predict future stock prices<br/>
                                        - not limited single timeseries. can learn from relational multiple time series and can predict based on that<br/>
                                        - find frequencies and seasonalities<br/>
                                        - always use entire time series for training, testing and inference<br/>
                                        - do not use large values for prediction length<br/>
                                        - train on many time series and not just one <br/>

                                        - input format - json lines - gzip or parquet<br/>
                                        - each record to have start time, target, can include categorical features<br/>
                                        - ex: start, target (list of time series values),categorical features and dynamical features  <br/><br/>
                                        
                                            
                                        - hyperparameters<br/>
                                        - mini_batch_size<br/>
                                        - epochs<br/>
                                        - learning_rate<br/>
                                        - num_cells<br/>
                                        - context_length - no of time points the model sees before making a prediction. can be smaller than typical seasonalities, since the model includes lagged inputs of up to one year anyhow<br/><br/>
                                        
                                        - instance types<br/>
                                            - can use either GPU or CPU (c4 2x /large can be cheaper to start with)<br/>
                                            - single or multi machine<br/>
                                            - cpu only inference<br/>
                                            - may need larger instances for tuning <br/><br/>



                                    Blazing Text - supervised learning and can predict<br/>
                                        - text classification - predict labels for a sentence. ex: web search for sentence and NOT for entire document<br/>
                                        - implements Word2vec and text classification algorithms<br/>
                                        - useful for many downstream NLP tasks such as sentiment analysis, named entity recognition, machine translation<br/>
                                        - use cases: web searches, information retrieval, ranking, document classification<br/>
                                        - use cases: spam detection: "You are a winner", "Click here to get your prize" etc.,<br/>
                                        - words that are semantically similar correspond to vectors that are close together, resulting that word embeddings capture the semantic relationships between words <br/>
                                        - Word2Vec - creates vector representation of words - to find words that are similar.<br/>
                                        - word embedding - semantically similar words are represented by vectors. close to each other . resulting vector representation of a word<br/>
                                        - this is used for NLP but is not an NLP algorithm in itself.<br/>
                                        - used in machine translation, sentiment analysis<br/>
                                        - NOTE: it works only in individual words, NOT sentences or documents<br/>

                                        - skip gram<br/>
                                        - Cbow - continuous bag of words<br/>
                                        - skip gram<br/>
                                        - batch skip gram - distributed computation over many CPU nodes  <br/><br/>
                                        
                                        - input format - <br/>
                                        for supervised learning - one sentence per line<br/>
                                        - first "word" in the sentence should be "__label__" followed by the label<br/>
                                        - also "augmented manifest text format"<br/>
                                        - Word2Vec - just wants a text file with one training sentence per line   <br/><br/>

                                        - hyperparameters<br/>
                                        - word2vec:<br/>
                                            - mode (batch_skipgram, skipgram, cbow)<br/>
                                            - learning_rate<br/>
                                            - window_size<br/>
                                            - vector_dim<br/>
                                            - negative_samples<br/><br/>
                                        - text classification<br/>
                                            - epochs<br/>
                                            - learning_rate<br/>
                                            - word_ngrams<br/>
                                            - vector_dim<br/><br/>

                                        - instance types<br/>
                                        - for cbow and skipgram - recommend single ml.p3.2xlarge<br/>
                                            - any single cpu or single gpu instance will work   <br/>
                                        - for batch_skipgram - can use single or multiple cpu instances<br/>
                                        - for text classification - c5 recommended if less than 2gb training data. for larger data sets use a single gpu instance(ml.p2.xlarge or p3 2xlarge)<br/><br/>


                                    Object2Vec<br/>
                                        - similar to word2vec (blazing text) - but on entire document  <br/>
                                        - creates lower dimensional dense embedding of high dimensional objects<br/>
                                        - compute nearest neighbors of objects. ex: similar genre of movies, recommendations, show items that are similar<br/>
                                        - unsupervised algorithm to identify the similarities<br/>

                                        - has two input channels (each two encoders and comparator),<br/>
                                        - encoder choices has average pooled embeddings, CNNs,bidirectional LSTM<br/>

                                        - input - data must be tokenized into integers<br/>
                                        - training data consists of pairs of tokens and/or sequence of tokens<br/><br/>

                                        - hyperparameters<br/>
                                        - dropout, early stopping, epochs, learning_rate, batch_size, layers, activation function, optimizer, weight decay<br/>
                                        - Enc1_network, enc2_network - choose hcnn, bilstm, pooled embedding<br/><br/>
                                        
                                        - instance type<br/>
                                        - can only train in single machine (can have multiple GPU)<br/>
                                        - m instances<br/>
                                        - for inference recommend ml p2 large<br/>
                                        - Use "INFERENCE_PREFERRED_MODE" environment variable to optimize encoder embeddings rather than classification or regression<br/><br/>
                                        
                                        - use case: analyze large set of insurance/claims for each claim, containing few sentences (many complex related information)<br/>


                                    Object Detection<br/>
                                        - identify objects in an image<br/>
                                        - detects and classifies objects with a single deep neural network<br/>
                                        - can train from scratch or use pre-trained models based on ImageNet<br/>

                                        - takes image as input and outputs images categories by confidence score<br/>
                                        - uses cnns, single shot multi box detector (SSD) algorithm<br/>
                                        - base CNN can be VGG-16 or ResNet-50<br/>
                                        - uses flip, rescale and jitter internally to avoid overfitting<br/><br/>

                                        - input format<br/>
                                        - RecordIO or image format (jpg or png)<br/>
                                        - JSON file for annotation data for each image  <br/><br/>

                                        - hyperparameters<br/>
                                        - mini_batch_size<br/>
                                        - learning_rate<br/>
                                        - optimizer - sgd, rmsprop, adadelta<br/><br/>
                                        
                                        - instance types<br/>
                                        - gpu instances for training (since cnns)<br/>
                                        - multi machines and multi GPUS also<br/>
                                        - for inference - use neural network once. can use c or p instances   <br/><br/>

                                        - usecase: it is not meant for image classification in the way and scale as CNN does (classify images that support multi label classification). CNN can scale millions of images at high resolution<br/><br/>

                                    Image Classification<br/>
                                        - assign one or more labels to an image<br/>
                                        - doesnt tell you where objects are just what objects are in the image<br/>

                                        - can train from scratch<br/>
                                        - under the hood uses ResNet CNN<br/>
                                        - full training mode - network initialized with random weights<br/>
                                        - transfer learning mode - pre trained weights, top fully connected layer is initialized with random weights. network is fine tuned with new training data<br/>
                                        - default image size is 3 channel- RGB - 224x224 (ImageNet's dataset)   <br/><br/>
                                        
                                        - input format<br/>
                                            - Apache MxNet RecordIO - Not Protobuf. this is for interoperability with other deep learning frameworks<br/>
                                            - Raw jpeg or png images. also provide ".lst" to mention the files  (index, class label, path to the image)<br/>
                                            - Augmented manifest image format - pipe mode<br/><br/>

                                        - hyperparameters<br/>
                                        - batchsize, learning_rate, optimizer<br/>
                                        - optimizer specific - weight decay, beta_1, beta_2, eps, gamma<br/><br/>

                                        - instance types<br/>
                                        - gpu trainings (p2/p3).<br/>
                                        - multi gpus or multiple machines<br/>
                                        - for inference cpu or gpu is fine<br/>
                                        
                                        - usecase: any time images need to be identified at scale (classify across millions of images/globe etc.,) then use image classification. Object detection will NOT scale.<br/><br/>


                                    Semantic Segmentation<br/>
                                        like Image classification - what objects are in it but takes it further more -  "pixel level object classification"<br/>
                                        - different from image classification - that assigns label to whole images<br/>
                                        - different from object detection - that assigns labels to bounding boxes<br/>
                                        - useful for self driving vehicles, medical imaging diagnostics, robot sensing<br/>
                                        - produces a "segmentation" mask.<br/>

                                        - built on MXNet Gluon and Gluon CV<br/>
                                        - 3 algorithms<br/>
                                        - FCN - fully convolutional network<br/>
                                        - PSP - pyramid scene parsing<br/>
                                        - DeepLabV3<br/>
                                        - Choices of backbones:<br/>
                                        - ResNet50<br/>
                                        - ResNet101<br/>
                                        - Both trained on ImageNet<br/>
                                        - Incremental training, training from scratch, supported too <br/><br/>

                                        - input format<br/>
                                        - jpg/png annotations<br/>
                                        - for both training and validation<br/>
                                        - label maps to describe annotations<br/>
                                        - augmented manifest image format supported for pipe mode<br/>
                                        - jpg images accepted for inference<br/><br/>

                                        - hyperparameters<br/>
                                        - epochs, learning rate, batch size, optimizer etc.,<br/>
                                        - algorithm<br/>
                                        - backbone<br/><br/>

                                        - instance types<br/>
                                        - more restricted.. only GPU supported for training (p2 or p3) on a single machine only<br/>
                                        - inference on cpu (c5 or m5) or GPU (p2 or p3)
                                        <br/>


                                    RANDOM CUT FOREST (RCF)<br/><br/>

                                        - Amazon's algorithm for "Anomaly Detection"<br/>
                                        - unsupervised set<br/>
                                        - process a series of data and find anomalies<br/>
                                        - breaks in periodicity, detect unexpected spikes in time series data<br/>
                                        - unclassifiable data points<br/>
                                        - assigns an "anomaly score" to each data point<br/>

                                        - creates a forest of trees<br/>
                                        - each tree is a partition of the training data<br/>
                                        - when data is added, looks at expected change in complexity of the tree<br/>
                                        - data is sampled randomly<br/>
                                        - then trained<br/>
                                        - RCF shows in kinesis analytics as well<br/>
                                        - it can work on streaming data too<br/><br/>

                                        - input-format<br/>
                                        - csv or RecordIO-protobuf<br/>
                                        - optional test channel for computing accuracy, precision, recall and F1 on labeled data (anomaly or not)<br/><br/>



                                        - hyperparameters<br/>
                                        - num_trees - increasing reduces noise<br/>
                                        - num_samples_per_tree - choose so that 1/num_samples_per_tree approximates the ratio of anomalous to normal data<br/><br/>

                                        - instance type<br/>
                                        - does not take advantage of GPUs<br/>
                                        - use m4, c4 or c5 for training<br/>
                                        - ml.c5.xl for inference<br/><br/>



                                    Neural Topic Model<br/>
                                        - organize documents into topics<br/>
                                        - classify or summarize documents based on topics<br/>
                                        - not just TF/IDF<br/>
                                        - bike, car, train, mileage, speed might classify a document as "transportation" for example<br/>
                                        - unsupervised<br/>
                                        - algorithm is "Neural variational inference"<br/>

                                        - Four data channels - "train" is required, "validation", "test" and "auxiliary" optional<br/>

                                        - Define how many topics you want<br/>
                                        - these topics are a latent representation based on top ranking words<br/>
                                        - one of two topic modeling algorithms in sagemaker<br/><br/>

                                        - input-format<br/>
                                        - RecordIO-protobuf or CSV<br/>
                                        - Words must be tokenized into integers - document must contain a count of every word in vocabulary in csv, "auxiliary" channel is for vocabulary<br/>
                                        - File or pipe mode<br/><br/>

                                        - hyperparameters<br/>
                                        - lowering mini_batch_size, learning_rate can reduce validation loss - at the expense of training time<br/>
                                        - num_topics<br/><br/>

                                        - instance types  <br/>
                                        - GPU or CPU<br/>
                                        - GPU for training<br/>
                                        - CPU ok for inference<br/><br/>

                                        - usecase: to improve quality of searches for a library of documents. use ML to identify the key topics for each document (uploaded in pdf, rtf, ascii text etc.,)<br/>

                                    LDA  - Another topic model (not based on deep learning)<br/>
                                        - Latent Dirichlet Allocation<br/>
                                        - unsupervised<br/>
                                        - topics are unlabeled, they are just grouping of documents with a shared subset of words<br/>
                                        - can be used for things other than words<br/>
                                        - cluster customers based on purchases<br/>
                                        - harmonic analysis in music<br/>

                                        - unsupervised. generates however many topics you specify<br/>
                                        - optional test channel can be used for scoring results - per-word log likelihood<br/>
                                        - functionally similar to NTM, but CPU-based, so may be cheaper/more efficient<br/><br/>

                                        -input-format<br/>
                                        - train channel and optional test channel<br/>
                                        - RecordIO-protobuf or CSV<br/>
                                        - each document has counts for every word in vocabulary (in csv format)<br/>
                                        - pipe mode only supported with RecordIO<br/><br/>

                                        - hyper_parameters<br/>
                                            - num_topics - note these are not human readable documents. internal grouping of documents<br/>
                                            - Alpha0 - initial guess for concentration parameter. smaller values generate sparse topic mixture. larger values (>1.0) produce uniform mixtures<br/><br/>


                                        - instance type<br/>
                                            - single instance cpu training<br/><br/>

                                        - usecase: to improve quality of searches for a library of documents. use ML to identify the key topics for each document (uploaded in pdf, rtf, ascii text etc.,)<br/>
                                        

                                    KNN - K nearest neighbors<br/>
                                        - simplest classification or regression algorithm<br/>
                                        - this is a supervised technique<br/>
                                        - this is an index based algorithm, uses non parametric method for classification or regression<br/>
                                        - Classification - Find the K closest point to a sample point (may be using some distance metric) and return the most frequent label<br/>
                                        - Regression - Find the K closest points to a sample point and return the average value<br/>


                                        - Data is first sampled<br/>
                                        - sagemaker includes dimensionality reduction - avoid sparse data (curse of dimensionality), at cost of noise/accuracy, "sign" or "fjlt" methods<br/>
                                        - build an index for looking up neighbors<br/>
                                        - serialize the model<br/>
                                        - query the model for a given K<br/><br/>

                                        - input-format<br/>
                                        - train channel contains your data<br/>
                                        - RecordIO-protobuf or CSV - First column is label<br/>
                                        - Test channel emits accuracy or MSE<br/>
                                        - File or pipe mode on either<br/><br/>

                                        - hyperparameters<br/>
                                        - K - how many neighbors to look at<br/>
                                        - sample_size<br/>
                                        - important: feature_dim, k, predictor_type, sample_size, dimensionality_reduction_target<br/><br/>

                                        - instance types<br/>
                                        - training on cpu or gpu<br/>
                                        - inference - CPU for lower latency, GPU for higher throughput on large batches<br/><br/>



                                    K-Means<br/>
                                        - KNN was technically supervised technique<br/>
                                        - expects tabular data. rows represent observations that you want to cluster and columns represent attributes<br/>
                                        - n attributes in each represents a point in n-dimensional space<br/>
                                        - Euclidean distance between the points represents the similarity of the corresponding observations<br/>
                                        - groups observations with similar attribute values(the points corresponding to these observations are closer together)<br/>
                                        - unsupervised clustering technique<br/>
                                        - divide data into K groups where members of a group are as similar as possible to each other. we can define what is "similar"<br/>
                                        - measured by "Euclidean" distance<br/>
                                        - use case; using census data find clusters of populations in counties across the us to focus political activity<br/>
                                        - web-scale-k-means clustering<br/>
                                        - doing this at large scale can be challenging. sagemaker helps<br/>


                                        - map every observation to n-dimensional space (n = number of features)<br/>
                                        - works to optimize the center of K clusters<br/>
                                            K = k*x<br/>
                                            NOTE: small "k" - actual no of clusters we want to end up with,<br/>
                                                big "K" - clusters will be working with<br/>
                                                "x" - extra cluster centers<br/><br/>
                                        - algorithm - <br/>
                                            - determine initial cluster centers: random or k-means++ approach- Tries to make initial clusters far apart<br/>
                                            - iterate over training data and calculate cluster centers<br/>
                                            - reduce clusters from "K" to "k" - using Lloyd's method with kmeans++<br/><br/>

                                        - input-format<br/>
                                        - train channel, optional test. Train "ShardedByS3Key", test "FullyReplicated"<br/>
                                        - RecordIO-protobuf or CSV<br/>
                                        - file or Pipe on either<br/><br/>


                                        - hyperparameters<br/>
                                        - K - plot within-cluster sum of squares as function of K<br/>
                                            - use "elbow method"<br/>
                                            - optimize for tightness of clusters<br/>
                                        - mini_batch_size<br/>
                                        - extra_center_factor<br/>
                                        - init_method<br/><br/>

                                        - instance types<br/>
                                        - cpu or gpu. but gpu is recommended<br/>
                                        - only one gpu per instance used on gpu<br/>
                                        - use p*.xlarge if you are using gpu<br/><br/>

                                    NOTE: Difference between K-means and K-nearest neighbor algorithm<br/>
                                    - In knn - find items/groups that are similar to each other<br/>
                                    - in kmeans - find items/groups that are similar to each other, BUT different from members of other groups. Additional modeling is done to differentiate segments (ex: group similar customers that have different purchase history)<br/>

                                    PCA - Principal Component analysis<br/>
                                        - unsupervised<br/>
                                        - Dimension Reduction - project higher-dimensional (lots of features) into lower dimensional (like a 2D plot) while minimizing loss of information<br/>
                                        - reduced dimensions are called "components". first component has largest possible variability. second has the next largest...<br/>

                                        - Covariance matrix is created, then singular value decomposition (SVD)<br/>
                                        - two modes<br/>
                                        - regular - for sparse data, moderate number of observations and features<br/>
                                        - randomized - for large number of observations and features. uses approximation algorithm<br/><br/>

                                        - input -format<br/>
                                        - RecordIO-protobuf or CSV<br/>
                                        - file or Pipe on either<br/>

                                        - hyperparameters<br/>
                                        - algorithm_mode<br/>
                                        - subtract_mean - unbias data<br/><br/>

                                        - instance types<br/>
                                        - cpu or gpu.  depends on the specifics of the input data<br/>


                                    Factorization Machines - dealing with sparse data<br/>
                                        - we might know only few info but (ex: products recommendation)<br/>
                                        - click prediction<br/>
                                        - recommendation system - products/pages that are purchased/visited<br/>
                                        - supervised - classification or regression<br/>
                                        - works on 2 dimensions<br/>
                                        - limited to "pair-wise" interactions - user to item for example<br/>

                                        - matrix that matches dimensions, what factors we can use to predict a classification<br/>
                                        - ex: click or not, purchase or not, users and items<br/>
                                        - usually used in the context of recommendation/recommender systems<br/><br/>

                                        - input-format<br/>
                                        - RecordIO-protobuf with Float32<br/>
                                        - sparse data means CSV is not practical<br/><br/>

                                        - hyperparameters<br/>
                                        - bias, factors, linear terms<br/>
                                        - uniform, normal or constant<br/>
                                        - can tune properties of each<br/><br/>

                                        - instance types<br/>
                                        - cpu or gpu.<br/>
                                        - cpu recommended<br/>
                                        - gpu only works with dense data<br/>
                                        <br/>


                                    IP insights in Sagemaker	  <br/>
                                        - finding fishy/suspicious behavior<br/>
                                        - login attempts, identify accounts creating resources from anonymous IPs<br/>
                                        - like fraud detection<br/><br/>

                                        - input-format<br/>
                                        - can take "user names", "account IDs" - can be fed directly. no need to pre-process<br/>
                                        - training channel. optional validation (computes AUC score)<br/>
                                        - CSV only - entity,IP<br/>

                                        - uses neural network to learn latent vector representation of entities and ip addresses<br/>
                                        - entities are hashed and embedded - need sufficient large hash size<br/>
                                        - automatically generates negative samples during training by randomly pairing entities and IPs<br/><br/>

                                        - hyperparameters<br/>
                                        - num_entity_vectors - hash size, set to twice the number of unique entity identifiers<br/>
                                        - vector_dim - size of embedding vectors, scales model size, too large results in overfitting<br/>
                                        - epochs, learning rate, batch size<br/><br/>

                                        - instance types<br/>
                                        - cpu or gpu. <br/>
                                        - gpu recommended<br/>
                                        - can use multiple gpus<br/>
                                        - size of cpu instance depends on vector_dim and num_entity_vectors<br/><br/>


                                    REINFORCEMENT LEARNING<br/>
                                        - we dont train or deployment<br/>
                                        - internally runs like an agent that "explores" some state<br/>
                                        - yields fast online performance once the space is explored<br/>
                                        - checks for "state" changes and the corresponding "actions", each of which earns a "reward" - positive or a penalty<br/>
                                        - builds the model based on above and deploys the training<br/>
                                        - ex: PacMan game, where the pacman can go. will it be eaten by the host for every right/left/updown move<br/>
                                        - other use cases - supply chain, hvac systems, industrial robotics, dialog systems, autonomous vehicles<br/><br/>

                                    - uses deep learning framework with tensorflow and MXNet<br/>
                                    - supports intel coach and ray Rllib toolkits<br/>
                                    - custom, open source and commercial environments supported - MATLAB, Simulink, EnergyPlus, RoboSchool, PyBullet, Amazon Sumerian, AWS RoboMaker etc.<br/>

                                    - distributed training with sagemaker - can distribute training and/or environment rollout, multicore or multi instance <br/>

                                    - environment - layout board, maze etc<br/>
                                    - state - where the player/pieces are<br/>
                                    - action - move in a given direction, etc.<br/>
                                    - reward - associated with the action from the state<br/>
                                    - observation - surroundings in a maze, state of chess board etc<br/><br/>
                                        
                                    Q-Learning<br/>
                                        - specific implementation of reinforcement learning<br/>
                                            - "states" - s<br/>
                                            - "actions" - possible actions - "a"<br/>
                                            - "value of each state/action" - "Q"<br/>
                                        - start off "Q" with 0<br/>
                                        - explore the space<br/>
                                        - if bad happens after a given state/action  - reduce (penalize) its Q<br/>
                                        - as good/rewards happen after a given state/action - increase(reward) its Q<br/>
                                        - you can also "look ahead" more than one step by using a discount factor when computing "Q"<br/>
                                        ex: s - previous state, s' - current state<br/>
                                        Q(s,a) += learning_rate * (reward(s,a) + discount * max(Q(s')) - Q(s,a))<br/><br/>
                                        <br/>
                                        Exploration problem<br/>
                                        - simple approach - choose action for a given state with highest Q. if there is a tie choose random<br/>
                                            - it can be inefficient. might miss a lot of paths that way<br/>
                                        - "Epsilon" term - introduce a value that dont follow the highest Q<br/>
                                            - choosing an epsilon value can be tricky<br/><br/>

                                    Markov Decision Process - MDP<br/>
                                        - provide mathematical framework for modeling decision making in situations where outcomes are partly random and partly under the control of decision maker<br/>
                                        - States are s and s'<br/>
                                        - state transitions functions = Pa (s, s')<br/>
                                        - rewards (our "Q" values) are described by a reward function = Ra(s, s')<br/><br/>

                                        MDP is a "discrete time stochastic control process"<br/><br/>

                                    NOTE: Reinforcement learning, Q-learning, Markov Decision process, Dynamic programming are similar concepts <br/>
                                    where you semi-randomly explore different choices of movements (actions) given different conditions (states)<br/>
                                    Keep track of the reward or penalty associated with each choice for a given state/action (Q). Use the stored Q for future choices<br/><br/>



                                    - hyperparameters<br/>
                                    - parameters of your choosing may be specific and may be abstracted<br/>
                                    - hyperparameter tuning in sagemaker can then optimize them<br/><br/>

                                    - instance types<br/>
                                    - no specific guidance <br/>
                                    - since its deep learning gpus can be helpful<br/>
                                    - supports multiple instances and cores<br/><br/>

                                    Custom Algorithms<br/>
                                    - Can create your own SageMaker algorithm resource when none of the built in algorithms available in SageMaker fit your problem. <br/>
                                    - Steps<br/>
                                    - create docker containers for your training and inference code<br/>
                                    - specify the hyperparameters that your algorithm supports<br/>
                                    - specify the metrics that your algorithm sends to Cloudwatch when training<br/>
                                    - instance types your algorithm supports for training and inference. <br/>
                                    - whether your algorithm supports distributed training across multiple instances (NOT distributed inferences)<br/><br/>


                                    In general<br/>
                                    - Deep learning algorithms - GPU instances (P2 or P3) for training<br/>
                                    - Inferences - running neural networks are less demanding - compute instances (C4, C5)<br/>
                                    - GPU instances can be really pricey<br/>
                                    - EC2 spot instances (can be interrupted) can save up to 90% over on-demand<br/>

                                    <br/>
                               </div>
                           
                                   
                               </AccordionDetails>
                       </Accordion>    
                       
                       <Accordion >
                           <AccordionSummary>
                               <b>AWS AI/ML Services</b>
                              
                           </AccordionSummary>
                           <AccordionDetails>
                               <div>
                                    Higher level AI ML Services<br/><br/>

                                    Amazon comprehend<br/>
                                        - NLP and text analytics<br/>
                                        - input social media, emails, web pages, documents, transcripts and medical records (Comprehend Medical)<br/>
                                        - extract key phrases, entities, sentiment, language, syntax, topics and document classifications<br/>
                                        - can train on your own data<br/>
                                        - identifies the language of the text, extracts key phrases, people, brands, events. identify positivity or negativity. <br/>
                                        - analyzes the text using tokenization and parts of speech and automatically organizes a collection of text files by topic<br/>
                                        - along with "AutoML" to build a custom set of entities or text classification models that are tailored uniquely for specific needs<br/>
                                        - usecase: reviews that do not contain any offensive or unsafe content, such as obscenities, threatening language<br/><br/>

                                    Amazon Translate<br/>
                                        - neural machine translation service that translates text. uses deep learning for translation<br/>
                                        - supports custom terminology - in csv, tmx format. <br/>
                                        - appropriate for proper names, brand names, etc.<br/><br/>

                                    Amazon Transcribe<br/>
                                        - speech to text.<br/>
                                        - input in flac, mp3, mp4 or wav in specific language<br/>
                                        - streaming audio supported (http/2 or web socket - french, english, spanish)<br/>
                                        - speaker identification - specify number of speakers<br/>
                                        - channel identification - two callers can be transcribed separately. merging based on timing of "utterances"<br/>
                                        - custom vocabularies - vocabulary list - special words like acronyms, vocabulary tables (can include "SoundsLike", "IPA" and "DisplayAs")<br/>
                                        - usecase: mobile streaming app- Transcribe HTTP/2 streaming client can handle retrying connections when there are intermittent problems on the network. "StartStreamTranscription" API call with bidirectional HTTP/2 streams audio to Amazon Transcribe<br/><br/>

                                    Note: sample use cases<br/>
                                    - English audio sentiment analysis: transcribe and comprehend<br/>
                                    - spanish audio for english speaker sentiment analysis - transcribe, translate and comprehend<br/><br/>

                                    Amazon Polly<br/>
                                        - text to speech<br/>
                                        - neural text to speech, many voices and languages<br/>
                                        - lexicons - customize pronunciation of specific words and phrases. ex: world wide web consortium instead of w3c<br/>
                                        - ssml - alternative to plain text. <br/>
                                        - speech synthesis markup language<br/>
                                        - gives control over emphasis, pronunciation, breathing, whispering, speech rate, pitch, pauses<br/>
                                        - speech marks<br/>
                                        - can encode when sentence/word starts and ends in an audio stream<br/>
                                        - useful for lip-synching animation<br/><br/>


                                    Amazon Rekognition<br/>
                                        - computer vision<br/>
                                        - object and scene detection - can use your own face collection<br/>
                                        - image moderation<br/>
                                        - facial analysis<br/>
                                        - celebrity recognition<br/>
                                        - face comparison<br/>
                                        - text in image<br/>
                                        - video analysis<br/>
                                        - object/people/celebrities marked on timeline<br/>
                                        - people pathing<br/><br/>
                                        - NOTE: cannot perform speech to text transformation. for those use cases (ex: analyze a spanish video for an english speaker and understand sentiment) use transcribe (speech to text) and comprehend (analyze spanish text and evaluate) and use translate (to english)   <br/><br/>

                                        - images from S3 or provide image bytes as part of request. s3 is faster<br/>
                                        - facial recognition depends on lighting, angle, visibility of eyes, resolution etc.,<br/>
                                        - video comes from kinesis video streams. H.264 encoded, 5-30 FPS, Favor resolution over framerate<br/>
                                        - can use with lambda to trigger image analysis upon upload<br/><br/>


                                        Rekognition custom labels <br/>
                                        - train with a small set of labeled images to train<br/>
                                        - use your own labels for unique items<br/>
                                        - example nfl uses custom labels to identify team logos, pylons and foam fingers in images<br/><br/>

                                    Amazon Forecast <br/>
                                    - time series analysis service<br/>
                                    - fully managed service to deliver highly accurate forecasts with ML<br/>
                                    - "AutoML" chooses best model for your time series data - ARIMA, DeepAR, ETS, NPTS, Prophet, CNN-QR<br/>
                                    - Note: algorithms like prophet, DeepAR+, ARIMA does not accept related time series without future values, use CNN-QR in those cases<br/>
                                        - Prophet - suited for time series with strong seasonal effects and several seasons of historical data<br/>
                                        - npts - suited for sparse or intermittent time series<br/>
                                        - arima - suited for simple datasets with under 100 time series. <br/>
                                        - deep ar - suited for forecasting like stocks. can have more administrative effort <br/>
                                        - deep ar+ - suited for large datasets containing 100s of feature time series. works with forward looking related time series. forecast would require less administrative overhead than sagemaker in these cases. ex: forecast stock price movement with futures<br/>
                                        - cnn qr - suited for datasets based on historical time series but with future values.  forecast stock price movement where historical related time series is used to improve the accuracy<br/>
                                    - works with any time series<br/>
                                    - price, promotions, economic performance<br/>
                                    - can combine with associated data to find relationships<br/>
                                    - inventory planning, financial planning, resource planning<br/>
                                    - based on "dataset groups", "predictors" and "forecasts"<br/><br/>

                                    Amazon Lex<br/>
                                        - billed as inner workings of alexa<br/>
                                        - natural language chatbot engine<br/>
                                        - a bot is built around intents<br/>
                                        - utterances invoke intents ("ex: i want to order a pizza")<br/>
                                        - lambda functions are invoked to fulfill the intent<br/>
                                        - slots specify extra information needed by the intent ("ex: pizza size, toppings, crust, when to deliver etc")<br/>
                                        - can deploy to AWS mobile sdk, facebook messenger, slack and twilio<br/><br/>

                                    Amazon Personalize<br/>
                                    - recommender system<br/><br/>

                                    Amazon Textract<br/>
                                        - OCR with forms, fields, tables support<br/><br/>

                                    AWS DeepRacer<br/>
                                        - Reinforcement learning powered 1/18-scale race car<br/><br/>

                                    DeepLens<br/>
                                        - deep learning-enabled video camera<br/>
                                        - integrated with Rekognition, SageMaker, Polly, tensorflow, mxnet, caffe<br/>
                                        - deeplens outputs to kinesis video streams<br/><br/>
                                    <br/>
                                    AWS DeepComposer<br/>
                                        - AI powered midi keyboard<br/>
                                        - composes a melody into an entire song<br/>
                                        - for educational purposes
                                        <br/><br/>

                                    Amazon Fraud Detector<br/>
                                        - upload your own historical fraud data<br/>
                                        - builds custom models from a template you choose<br/>
                                        - expose an api for your online application<br/>
                                        - assess risk from - new accounts, guest checkout, "try before you buy" abuse and online payments<br/>
                                        - Models - ONLINE_FRAUD_INSIGHTS is available while building the fraud detector model<br/><br/>

                                    Amazon CodeGuru<br/>
                                        - automated code reviews<br/>
                                        - finds lines of code that hurts performance<br/>
                                        - resource leaks, race conditions<br/>
                                        - offers specific recommendations<br/>
                                        - powered by ml<br/>
                                        - currently java<br/><br/>

                                    Contact lens for Amazon Connect<br/>
                                        - for customer support call centers<br/>
                                        - transcribe/ingest audio data from recorded calls<br/>
                                        - allow search on calls/chats<br/>
                                        - sentiment analysis<br/>
                                        - find "utterances" that correlate with successful calls<br/>
                                        - categorize calls<br/>
                                        - measure talk speed and interruptions<br/>
                                        - theme detection - discovers emerging issues<br/><br/>

                                    Amazon Kendra<br/>
                                        - enterprise search with natural language<br/>
                                        - ex: where is the it support desk, how do i connect to my vpn<br/>
                                        - combines data from file systems, sharepoint, intranet, sharing services (jdbc/s3) into one searchable repository<br/>
                                        - ML powered - thumbs up/down feedback<br/>
                                        - relevance tuning - boost strength of document freshness, view counts etc.,<br/>
                                        - use case: build index searchable document repository from unstructured document types like html, ppt, word, plain text, pdf documents<br/>
                                        - limits: total size on s3 bucket is 50MB, text extracted from an individual document cannot exceed 5MB<br/><br/>

                                    Amazon Augmented AI (A2I)<br/>
                                        - human review of ML predictions<br/>
                                        - similar to groundtruth but very general purpose<br/>
                                        - builds workflows for reviewing low-confidence predictions<br/>
                                        - access the Mechanical Turk workforce or vendors<br/>
                                        - integrated into amazon textract and rekognition<br/>
                                        - integrates with sagemaker<br/><br/>


                                    cat plot - to show relationship between numerical and one or more categorical variables using visualization such as violinplot, boxenplot, etc.<br/>
                                    swarm plot - to show categorical scatter plot data that shows the distribution of values for each feature<br/>
                                    pairs plot - to show the relationship between pairs of features and distribution of one of the variables in relation to the other. ex: to see which feature correlates well with other features<br/>
                                    covariance matrix - to show the degree of correlation between two features. visualization gives numerical representation of the correlation whereas the pairs plot gives you a visual representation as points plotted in two-dimensional space<br/>
                                    entropy matrix - measure of randomness in your features. <br/><br/>


                                    Sample use cases<br/>
                                    - Build your alexa - Transcribe to Lex to Polly<br/>
                                    - your language translator - Transcribe to Translate to Polly<br/>
                                    - Celebrity detector - DeepLens - Rekognition<br/>
                                    - Are people on the phone happy - Transcribe to Comprehend<br/><br/>


                                    AWS Glue<br/>
                                    - AWS glue with lakeformation can be an effective solution<br/>
                                    - glue crawler to build glue catalog, s3 upload event to trigger lambda function that will start glue crawler, cloudwatch event trigger can start glue etl job that process/transforms data into S3 data lake<br/>
                                    - aws glue can be used to build a series of transforms that uses DynamicFrames to pass the data from transform to transform. each transform can perform a different cleaning and/or transformation task<br/>
                                    - FindMatches ML algorithm can be used in many usecases.  ex: several car models/product lines with many similarities and also differences. incoming data/csv file, create your labeling file used to train your FindMatches to transform using an AWS Glue transform job<br/>
                                    - labeling file has to be in CSV. first two columns are labeling_set_id and label, remaining columns must match the schema of the data to be processed. must be encoded as utf-8 without BOM<br/>
                                    - using Spark ML jobs within AWS Glue to build feature transformation code. following ML packages/engines can be used for transformer tasks<br/>
                                    - MLeap can be added to inference pipelines<br/>
                                    - MLLib lets you build ML pipeline components to transform your data using full suite of standard transformers like tokenizers, OneHotEncoders, normalizers, etc.,<br/>
                                    - SparkML serving container allows you to deploy an apache spark ml pipeline in sagemaker<br/>
                                    - usecase: sentiment analysis using comprehend. S3 -> AWS Glue ETL -> Comprehend -> S3 -> SageMaker<br/>



                                    <br/>
                               </div>
                           
                                   
                               </AccordionDetails>
                       </Accordion>    
                       <Accordion>
                           <AccordionSummary>
                              <b>Visualizations</b>
                              
                           </AccordionSummary>
                           <AccordionDetails>
                               <div>
                                    AI ML - Visualizations<br/><br/>
                                    Types of information to BUI based on<br/>
                                    - KPIs - key performance indicator<br/>
                                    - Relationships<br/>
                                    - Comparisons<br/>
                                    - Distributions<br/>
                                    - Compositions<br/><br/>


                                    Charting Data - KPI<br/>
                                    - single value that represents a particular area or function and shows relative performance<br/>
                                    - ex: conversion rate (free to membership), relative market share, net profit margin, <br/><br/>

                                    Charting Data - Relationships<br/>
                                    - establish or prove a relationship between 2 or more variables<br/>
                                    - Charts<br/>
                                    - Scatter chart - compare two variables. ex: social media spend to adoption rate<br/>
                                    - Bubble chart  - compare three variables. ex: comparing investment return, investment duration and investment commitment. bubble size will be based on one of the variables<br/><br/>
                                    
                                    Charting data - comparison<br/>
                                    - show how variables change over time or show a static view of how different variables compare<br/>
                                    - Charts<br/>
                                    - Bar chart - compare one variable. ex: website hits in a given month<br/>
                                    - Column Chart - compares one or two variables changing over time. ex: show year over year sales and number of marketing campaigns<br/>
                                    - Table - compare three variables. ex: two dimensions represent rows and columns. third by the data in the cells<br/>
                                    - line chart - compares three or more variables changing over time. ex: show year over year sales, number of marketing campaigns and web traffic<br/><br/>
                                    
                                    Charting Data - Distributions<br/>
                                    - show how data is distributed over defined intervals.<br/>
                                    - Interval means clustering or grouping. NOT time<br/>
                                    - Charts<br/>
                                    - Column Histogram: One variable. ex: how many voters are in various generation groups, counting something and putting them into buckets<br/>
                                    - Scatter Chart: two variables. ex: relating investment duration/time (ex: x axis), return on investment (ex: y axis),  and investment size (bubble or scatter size)<br/><br/>
                                    
                                    Charting Data - Compositions<br/>
                                    - show the elements that make up data set, static or changing over time<br/>
                                    - Charts<br/>
                                    - Pie Chart - simple share of total<br/>
                                    - Stacked 100% Bar chart - components of components<br/>
                                    - Tree Map - share of total<br/>
                                    - Stacked Area Chart - 5  or more periods; relative and absolute differences<br/>
                                    - Stacked 100% area chart - 5 or more periods; relative differences<br/>
                                    - Stacked Column chart - less than 5 periods. relative and absolute differences<br/><br/>
                                    

                                    CloudWatch Metrics can be used to visualize based on the logs<br/>
                                    sample usecase: keras model to visualization code to monitor the training metrics of the model<br/>
                                    Steps<br/>
                                    - while creating the model training job, specify regex patterns for the metrics that your code writes to the logs. you cannot specify metrics directly; this has to be done through regex patterns<br/>
                                    - Using the cloudwatch metrics dashboard, visualize the metrics that sagemaker automatically parses from the logs <br/>
                                    - publish graphs and visualization<br/><br/>

                                    Visualization using K-Means<br/>
                                    gain access to the python methods that allow you to visualize your metrics in charts<br/>
                                    - use SageMaker python module - sagemaker.analytics<br/>
                                    - import TrainingJobAnalytics<br/>
                                    - metrics - test:msd or test:ssd<br/>
                                    
                                    <br/>
                               </div>
                           
                                   
                               </AccordionDetails>
                       </Accordion>    
                       
                       <Accordion >
                           <AccordionSummary>
                               <b>Operations and Implementations</b>
                              
                           </AccordionSummary>
                           <AccordionDetails>
                               <div>
                                   
                                    Machine Learning implementation and operations<br/><br/>

                                    - deploy to prod - scale reliably and secure<br/>
                                    - SageMaker - interacts with container, elastic inference, <br/>
                                    - SageMaker Neo <br/>
                                    - IAM, KMS, private VPC<br/>
                                    - EC2, A/B tests
                                    <br/><br/>
                                    SageMaker and Docker Containers<br/>
                                        - all models in sagemaker are hosted in docker containers<br/>
                                        - pre-built deep learning<br/>
                                        - pre-built scikit-learn and spark ML<br/>
                                        - pre-built tensorflow, MXNet, chainer, pytorch - distributed training via Horovod or Parameter servers<br/>
                                        - NOTE: - tensorflow does not get distributed across machines/GPUs on its own. Use Horovod or Parameter servers<br/>
                                        - your own training and inference code. or extend pre-built image<br/>
                                        - allows to use any script/algorithm within sagemaker regardless of runtime or language<br/>
                                        - containers are isolated and contain all dependencies<br/><br/>


                                        structure of a training container<br/><br/>
                                        <br/>
                                        ```<br/>
                                            /opt/ml<br/>
                                            /opt/ml/input<br/>
                                            /opt/ml/input/config<br/>
                                            /opt/ml/input/config/hyperparameters.json<br/>
                                            /opt/ml/input/config/resourceconfig.json<br/>
                                            /opt/ml/input/data/channel_name/input data<br/>
                                            /opt/ml/model<br/>
                                            /opt/ml/code/script files<br/>
                                            /opt/ml/output<br/>
                                            /opt/ml/output/failure<br/>
                                        ```<br/>

                                        structure of a deployment container<br/><br/>

                                        ```<br/>
                                            /opt/ml/<br/>
                                            /opt/ml/model<br/>
                                            /opt/ml/model/model files<br/>
                                        ```
                                        <br/><br/>

                                        WORKDIR<br/>
                                        - nginx.conf<br/>
                                        - webserver thats running<br/>
                                        - predictor.py<br/>
                                        - flask web server running at runtime predicting<br/>
                                        - serve/<br/>
                                        - gunicorn server multiple instances of the scripts<br/>
                                        - train/<br/>
                                        - program that runs the training. can implement your own training algorithm<br/>
                                        - wsgi.py<br/>
                                        - wrapper to invoke the flask application to serve the results<br/><br/>


                                        Dockerfile<br/><br/>

                                        ```<br/>
                                        FROM tensorflow/tensorflow:2.0.0a0<br/><br/>

                                        RUN pip install sagemaker-containers<br/><br/>

                                        # Copies the training code inside the container<br/>
                                        COPY train.py /opt/ml/code/train.py<br/><br/>

                                        # Defines train.py as script entrypoint<br/>
                                        ENV SAGEMAKER_PROGRAM train.py<br/><br/>
                                        <br/>
                                        ```<br/><br/>

                                        Other environment variables are<br/><br/>

                                        - SAGEMAKER_PROGRAM - run a script inside /opt/ml/code<br/>
                                        - SAGEMAKER_TRAINING_MODULE<br/>
                                        - SAGEMAKER_SERVICE_MODULE<br/>
                                        - SM_MODEL_DIR<br/>
                                        - SM_CHANNELS / SM_CHANNEL_*<br/>
                                        - SM_HPS / SM_HP_*<br/>
                                        - SM_USER_ARGS<br/><br/>
                                        - .. and many more<br/><br/>

                                        <br/>
                                        To use your own image<br/><br/>

                                        ```<br/>

                                        cd dockerfile<br/>
                                        !docker build -t foo .<br/><br/>

                                        from sagemaker.estimator import Estimator<br/><br/>

                                            estimator = Estimator(image_name='foo', role='SageMakerRole', <br/>
                                                                train_instance_count=1, train_instance_type='local')<br/><br/>

                                            estimator.fit()<br/><br/>

                                        ```<br/><br/>


                                    Production Variants - for A/B tests<br/><br/>

                                        - you can test out multiple models on live traffic using Production variants<br/>
                                        - variant weights tell sagemaker how to distribute traffic among them<br/>
                                        - so you could roll out a new iteration of your model at say 10% variant weight<br/>
                                        - once you are fine, ramp it up to 100%<br/><br/><br/>


                                    SageMaker Neo - on the Edge<br/><br/>

                                        - Train once, run anywhere<br/>
                                        - edge devices - ARM, Intel, Nvidia processors, embedded in whatever - your car<br/>
                                        - optimizes code for specific devices - tensorflow, mxnet, pytorch, onnx, xgboost<br/>
                                        - consists of a compiler and a runtime<br/><br/>

                                        Neo + AWS IoT greengrass<br/>
                                        - hosted on c5, m5, m4, p3, p2 instances<br/>
                                        - must be same instance type used for compilation<br/><br/>

                                        OR IoT GreenGrass<br/>
                                        - this is how you get the model to an actual edge device<br/>
                                        - inference at the edge with local data, using model trained in the cloud<br/>
                                        - uses lambda inference applications<br/><br/>

                                    IoT<br/>
                                    - IoT Core sends sensor data to IoT analytics for enrichment and analysis. pre trained model is deployed into the field using IoT greengrass where we can perform ML inference using enriched data on the local devices. <br/>
                                    - IoT greengrass makes it easy to perform ML inference locally on devices using models that are created, trained and optimized in the cloud. IoT greengrass gives flexibility to use ML trained in SageMaker or to bring your own pre trained model stored in Amazon S3<br/>
                                    - IoT analytics can be built specifically for analyzing highly unstructured IoT data (note. kinesis data analytics can be used to analyze IoT device data streams). IoT analytics is a better choice<br/>
                                    - IoT rules can help evaluate and send notifications when the peak thresholds are exceeded. rule engines listens for incoming MQTT messages that match a rule and can trigger a lambda function, sns topic or write to S3<br/>
                                    - IoT Device (send MQTT messages) -> IoT Core -> Kinesis Data Streams -> Lambda function (transform IoT message data to inference request serialization format)<br/><br/>
                                    - use case:  sensors on the ground/farmers measure temperature, humidity etc.,<br/><br/>

                                    <img width="75%" src="https://d1.awsstatic.com/reInvent/re20-pdp-tier2-3/Greengrass/product-page-diagram_AWS-Greengrass_service-in-cloud%402x.2c491da3fca25b112d6a50c551067b22bbde7028.png"></img>

                                    
                                    <br/><br/>

                                    SageMaker Inference Pipeline<br/>
                                    - fully managed<br/>
                                    - Multiple models can be created using Amazon SageMaker built in algorithms. These models need to be deployed so the inference from first model can be passed to the next model and so on<br/>
                                    - can be used to combine multiple steps like preprocessing, predictions and post processing tasks<br/>
                                    - is an amazon sagemaker model that is composed of a linear sequence of 2 to 5 containers that process requests for inferences on data. <br/>
                                    - use inference pipeline to define and deploy any combination of pretrained sagemaker built in algorithms and your own custom algorithms packed in docker containers<br/>
                                    - SageMaker handles invocations as sequence of http requests. first container in the pipeline handles the initial request, then the intermediate response is sent as a request to the second container and so on. sagemaker returns the final response to the client<br/><br/>

                                    CloudTrail Integration<br/>
                                    - seamless integration with SageMaker<br/>
                                    - provides a record of actions taken by a user, role or an aws service in SageMaker.<br/>
                                    - captures all API calls except "InvokeEndpoint" as events. The calls captured include calls from SageMaker console, code calls to the amazon sageMaker api operations<br/>
                                    - continuous delivery to s3 bucket. if a trail is not configured, the most recent events can still be viewed in the cloudtrail console under event history<br/>

                                    <br/>
                                </div>      
                            </AccordionDetails>
                       </Accordion>    
                       
                       
               </div>              
           </div>
          
         </div>
     </div>
    );
  }
}
 
export default MLBase;