diff --git a/build/cmake/app/core.cmake b/build/cmake/app/core.cmake index 616ec04a4..3bc74d490 100644 --- a/build/cmake/app/core.cmake +++ b/build/cmake/app/core.cmake @@ -25,7 +25,7 @@ add_custom_command( OUTPUT ${STAMP_DIR}/core-build-release-dir DEPENDS ${core_files} COMMAND mkdir -p ${CORE_BUILD_DIR}/releases/0 - COMMAND rsync -a --delete-after --delete-excluded "--filter=:- ${CMAKE_SOURCE_DIR}/.gitignore" --include=/templates/etc/perp/.boot --exclude=.* --exclude=/templates/etc/test-env* --exclude=/templates/etc/perp/test-env* --exclude=/src/api-umbrella/web-app/spec --exclude=/src/api-umbrella/web-app/app/assets --exclude=/src/api-umbrella/hadoop-analytics --include=/bin/*** --include=/config/*** --include=/LICENSE.txt --include=/templates/*** --include=/src/*** --exclude=* ${CMAKE_SOURCE_DIR}/ ${CORE_BUILD_DIR}/releases/0/ + COMMAND rsync -a --delete-after --delete-excluded "--filter=:- ${CMAKE_SOURCE_DIR}/.gitignore" --include=/templates/etc/perp/.boot --exclude=.* --exclude=/templates/etc/test-env* --exclude=/templates/etc/perp/test-env* --exclude=/src/api-umbrella/web-app/spec --exclude=/src/api-umbrella/web-app/app/assets --include=/bin/*** --include=/config/*** --include=/LICENSE.txt --include=/templates/*** --include=/src/*** --exclude=* ${CMAKE_SOURCE_DIR}/ ${CORE_BUILD_DIR}/releases/0/ COMMAND touch ${STAMP_DIR}/core-build-release-dir ) diff --git a/build/package/build_package b/build/package/build_package index 8ba673639..8c44c0cca 100755 --- a/build/package/build_package +++ b/build/package/build_package @@ -49,14 +49,6 @@ if [ "$PACKAGE" == "core" ]; then fpm_args+=("--after-remove" "$source_dir/build/package/scripts/after-remove") fpm_args+=("--directories" "/opt/api-umbrella") fpm_args+=("--directories" "/etc/api-umbrella") -elif [ "$PACKAGE" == "hadoop-analytics" ]; then - for dep in "${hadoop_analytics_package_dependencies[@]}"; do - fpm_args+=("-d" "$dep") - done - - fpm_args+=("-C" "$WORK_DIR/package-dest-hadoop-analytics") - fpm_args+=("--name" "api-umbrella-hadoop-analytics") - fpm_args+=("--depends" "api-umbrella") fi mkdir -p "$PACKAGE_WORK_DIR/build/$PACKAGE" diff --git a/build/package/scripts/after-install b/build/package/scripts/after-install index 7f15526ea..9b3a9fdab 100755 --- a/build/package/scripts/after-install +++ b/build/package/scripts/after-install @@ -79,11 +79,6 @@ if [ "$configure" = "true" ]; then # Set file permissions chown -R $user:$group $prefix_dir/etc $prefix_dir/var chown -R $deploy_user:$deploy_group $embedded_dir/apps - if [ -d $embedded_dir/kylin ]; then - chown $user:$group $embedded_dir/kylin/tomcat/temp \ - $embedded_dir/kylin/tomcat/webapps \ - $embedded_dir/kylin/tomcat/logs - fi # Re-create symlinks that may have inadvertently been cleaned up by the API # Umbrella v0.8 and v0.9 after-remove scripts during upgrades (this should be diff --git a/build/package_dependencies.sh b/build/package_dependencies.sh index 5c23e6665..5e8a4ae9e 100644 --- a/build/package_dependencies.sh +++ b/build/package_dependencies.sh @@ -65,9 +65,6 @@ if [ -f /etc/redhat-release ]; then libxml2-devel libxslt-devel ) - hadoop_analytics_package_dependencies=( - java-1.8.0-openjdk-headless - ) core_build_dependencies=( autoconf automake @@ -96,9 +93,6 @@ if [ -f /etc/redhat-release ]; then unzip xz ) - hadoop_analytics_build_dependencies=( - java-1.8.0-openjdk-devel - ) test_build_dependencies=( # Binary and readelf tests file @@ -189,9 +183,6 @@ elif [ -f /etc/debian_version ]; then libxml2-dev libxslt-dev ) - hadoop_analytics_package_dependencies=( - 
"openjdk-$openjdk_version-jre-headless" - ) core_build_dependencies=( autoconf automake @@ -221,9 +212,6 @@ elif [ -f /etc/debian_version ]; then uuid-dev xz-utils ) - hadoop_analytics_build_dependencies=( - "openjdk-$openjdk_version-jdk" - ) test_build_dependencies=( # Binary and readelf tests file @@ -257,9 +245,7 @@ fi all_build_dependencies=( "${core_package_dependencies[@]}" - "${hadoop_analytics_package_dependencies[@]}" "${core_build_dependencies[@]}" - "${hadoop_analytics_build_dependencies[@]}" ) # shellcheck disable=SC2034 diff --git a/build/scripts/distclean b/build/scripts/distclean index 9accefe14..bb3298d41 100755 --- a/build/scripts/distclean +++ b/build/scripts/distclean @@ -20,7 +20,6 @@ rm -rf \ "$source_dir/build/work"/* \ "$source_dir/cmake_install.cmake" \ "$source_dir/install_manifest.txt" \ - "$source_dir/src/api-umbrella/hadoop-analytics"/*/dependency-reduced-pom.xml \ "$source_dir/src/api-umbrella/web-app/.bundle" \ "$source_dir/src/api-umbrella/web-app/brakeman.html" \ "$source_dir/src/api-umbrella/web-app/log"/* \ diff --git a/config/default.yml b/config/default.yml index 9ff69a878..958132cc4 100644 --- a/config/default.yml +++ b/config/default.yml @@ -210,30 +210,6 @@ analytics: adapter: elasticsearch timezone: UTC log_request_url_query_params_separately: false -kylin: - protocol: http - host: 127.0.0.1 - port: 7070 -presto: - protocol: http - host: 127.0.0.1 - port: 14015 - embedded_server_config: - http_port: 14015 - max_memory: 2GB - max_memory_per_node: 1GB - discovery_uri: http://127.0.0.1:14015 - query_max_run_time: 60s - hive: - metastore_uri: thrift://127.0.0.1:9083 -flume: - host: 127.0.0.1 - port: 14016 - embedded_server_config: - java_opts: -kafka: - brokers: [] - topic: api_umbrella_logs strip_cookies: - ^__utm.*$ - ^_ga$ diff --git a/docs/developer/analytics-architecture.md b/docs/developer/analytics-architecture.md index e91e4ad54..c76df8c79 100644 --- a/docs/developer/analytics-architecture.md +++ b/docs/developer/analytics-architecture.md @@ -16,12 +16,8 @@ To explain each step: - It can transform the data and send it to multiple different endpoints. - The storage database stores the raw analytics data for further querying or processing. -API Umbrella supports different analytics databases: - ## Elasticsearch -Suitable for small to medium amounts of historical analytics data. *(TODO: Provide more definitive guidance on what small/medium/large amounts are)* - ### Ingest Data is logged directly to Elasticsearch from rsyslog: @@ -41,132 +37,3 @@ The analytic APIs in the web application directly query Elasticsearch: ``` [api-umbrella-web-app] => [Elasticsearch] ``` - -## PostgreSQL - -**TODO: The PostgreSQL adapter doesn't currently exist, but the idea is to leverage the same SQL framework built for Kylin.** - -Suitable for small amounts of historical analytics data, or small to medium amounts of data with a columnar storage extension. *(TODO: Provide more definitive guidance on what small/medium/large amounts are)* - -### Ingest - -Data is logged directly to PostgreSQL from rsyslog: - -``` -[nginx] ====> [rsyslog] ===> [PostgreSQL] - JSON SQL -``` - -- rsyslog buffers and sends data to PostgreSQL as individual inserts. -- rsyslog's ompgsql output module is used. 
-- If rsyslog supports batched transactions in the future, we should switch to that: [rsyslog#895](https://github.com/rsyslog/rsyslog/issues/895) - -### Querying - -The analytic APIs in the web application directly query PostgreSQL: - -``` -[api-umbrella-web-app] ===> [PostgreSQL] - SQL -``` - -### PostgreSQL: Columnar Storage - -For better analytics performance with larger volumes of analytics data, you can continue to use the PostgreSQL adapter, but with a compatible column-based variant: - -- [cstore_fdw](https://github.com/citusdata/cstore_fdw) -- [Amazon Redshift](https://aws.amazon.com/redshift/) - -When these are used, the SQL table design and process remains the same, only the underlying table storage is changed for better analytic query performance. - -## Kylin - -Suitable for large amounts of historical analytics data. *(TODO: Provide more definitive guidance on what small/medium/large amounts are)* - -This is the most complicated setup, but it allows for vastly improved querying performance when dealing with large amounts of historical data. This is achieved by using [Kylin](http://kylin.apache.org) to pre-compute common aggregate totals. By pre-computing common aggregations, less hardware is needed than would otherwise be needed to quickly answer analytics queries over large amounts of data. Under this approach, analytics data may not be immediately available for querying, since additional processing is required. - -### Ingest - -During ingest, there are several concurrent processes that play a role: - -``` -[nginx] ====> [rsyslog] ====> [Kafka] ====> [Flume] ====> [HDFS - JSON (temp)] - JSON JSON JSON JSON -``` - -``` -[HDFS - JSON (temp)] => [API Umbrella Live Processor] => [Hive - ORC] -``` - -``` -[Hive - ORC] => [API Umbrella Kylin Refresher] => [Kylin] -``` - -- rsyslog buffers and sends JSON messages to Kafka using the [omkafka](http://www.rsyslog.com/doc/v8-stable/configuration/modules/omkafka.html) output module. - - Kafka is used as an intermediate step as a reliable way to get messages in order to Flume, but primarily Kafka is being used because that's what Kylin's future [streaming feature](http://kylin.apache.org/blog/2016/02/03/streaming-cubing/) will require (so it seemed worth getting in place now). -- Flume takes messages off the Kafka queue and appends them to a gzipped JSON file stored inside Hadoop (HDFS). - - The JSON files are flushed to HDFS every 15 seconds, and new files are created for each minute. - - The per-minute JSON files are partitioned by the request timestamp and not the timestamp of when Flume is processing the message. This means Flume could be writing to a file from previous minutes if it's catching up with a backlog of data. - - Kafka's stronger in-order handling of messages should ensure that the per-minute JSON files are written in order, and skipping between minutes should not be likely (although possible if an nginx server's clock is severely skewed or an nginx server goes offline, but still has queued up messages that could be sent if it rejoins later). - - Flume plays a very similar role to rsyslog, but we use it because it has the best integration with the Hadoop ecosystem and writing to HDFS (I ran into multiple issues with rsyslog's native omhdfs and omhttpfs modules). -- The API Umbrella Live Processor task determines when a per-minute JSON file hasn't been touched in more than 1 minute, and then copies the data to the ORC file for permanent storage and querying in the Hive table. 
- - The live data should usually make it's way to the permanent ORC storage within 2-3 minutes. - - The ORC data is partitioned by day. - - The data is converted from JSON to ORC using a Hive SQL command. Each minute of data is appended as a new ORC file within the overall ORC daily partition (which Hive simply treats as a single daily partition within the overall logs table). - - Since the data is only appended, the same minute cannot be processed twice, which is why we give a minute buffer after the JSON file has ceased writing activity to convert it to ORC. - - The ORC file format gives much better compression and querying performance than storing everything in JSON. - - If a new ORC file is created for a new day, the partition will be added to the Hive table. - - At the end of each day, overwrite the daily ORC file with a new, compacted file from the original JSON data. Writing the full day at once provides better querying performance than the many per-minute ORC files. By basing this daily file on the original JSON data, it also alleviates any rare edge-cases where the per-minute appender missed data. - - Automatically remove old JSON minute data once it's no longer needed. -- The API Umbrella Kylin Refresher task is responsible for triggering Kylin builds to updated the pre-aggregated data. - - At the end of each day, after writing the compacted ORC file for the full day, we then trigger a Kylin build for the most recent day's data. - -This setup is unfortunately complicated with several moving pieces. However, there are several things that could potentially simplify this setup quite a bit in the future: - -- [Kylin Streaming](http://kylin.apache.org/blog/2016/02/03/streaming-cubing/): This would eliminate our need to constantly refresh Kylin throughout the day, and reduce the amount of time it would take live data to become available in Kylin's pre-aggregated results. This feature available as a prototype in Kylin 1.5, but we're still on 1.2, and we'll be waiting for this to stabilize and for more documentation to come out. But basically, this should just act as another consumer of the Kafka queue, and then it would handle all the details of getting the data into Kylin. -- [Flume Hive Sink](https://flume.apache.org/FlumeUserGuide.html#hive-sink): Even with Kylin streaming support, we will likely still need our own way to get the live data into the ORC-backed Hive table. Flume's Hive Sink offers a way to directly push data from Flume into a ORC table. Currently marked as a preview feature, I ran into memory growth and instability issues in my attempts to use it, but if this proves stable in the future, it could be a much easier path to populating the ORC tables directly and get rid of the need for temporary JSON (along with the edge conditions those bring). -- [Kylin Hour Partitioning](https://issues.apache.org/jira/browse/KYLIN-1427): A possible shorter-term improvement while waiting for Kylin streaming is the ability to refresh Kylin by hour partitions. This would be more efficient than our full day refreshes currently used. This is currently implemented in v1.5.0, but we first need to upgrade to 1.5 (we're holding back at 1.2 due to some other issues), and then [KYLIN-1513](https://issues.apache.org/jira/browse/KYLIN-1513) would be good to get fixed before. 
- -### Querying - -The analytic APIs in the web application query Kylin or [PrestoDB](https://prestodb.io) using SQL statements: - -``` - /==> [Kylin] ====> [HBase Aggregates] - / -[api-umbrella-web-app] ===> - SQL \ - \==> [PrestoDB] => [Hive ORC Tables] -``` - -- Queries are attempted against Kylin first, since Kylin will provide the fastest answers from it's pre-computed aggregates. - - Kylin will be unable to answer the query if the query involves dimensions that have not been pre-computed. - - We've attempted to design the Kylin cubes with the dimensions that are involved in the most common queries. These are currently: - - `timestamp_tz_year` - - `timestamp_tz_month` - - `timestamp_tz_week` - - `timestamp_tz_date` - - `timestamp_tz_hour` - - `request_url_host` - - `request_url_path_level1` - - `request_url_path_level2` - - `request_url_path_level3` - - `request_url_path_level4` - - `request_url_path_level5` - - `request_url_path_level6` - - `user_id` - - `request_ip` - - `response_status` - - `denied_reason` - - `request_method` - - `request_ip_country` - - `request_ip_region` - - `request_ip_city` - - We don't add all the columns/dimensions to the Kylin cubes, since each additional dimension exponentially increases the amount of data Kylin has to pre-compute (which can significantly increase processing time and storage). - - Data must be processed into Kylin for it to be part of Kylin's results, so the results will typically lag 30-60 minutes behind live data. -- If Kylin fails for any reason (the query involves a column we haven't precomputed or Kylin is down), then we perform the same query against PrestoDB. This queries the underlying ORC tables stored in Hive (which is the same raw data Kylin bases its data cubes on). - - PrestoDB is used to provide an ANSI SQL layer on top of Hive. This should provide better compatibility with the SQL queries we're sending to Kylin, since both Kylin and PrestoDB aim for ANSI SQL compatibility (unlike Hive, which uses a different SQL-like HiveQL). - - PrestoDB also offers better performance (significant in some cases) for our SQL queries rather than querying Hive directly. PrestoDB has also been fairly optimized for querying ORC tables. - - Queries hitting PrestoDB will be slower than Kylin-answered queries. Query times vary primary depending on how much data is being queried, but response times may range from 5 seconds to multiple minutes. - - Data must be processed into the ORC-backed Hive table for it to be part of PrestoDB's results, so results will typically lag 2-3 minutes behind live data (and therefore differ from Kylin results). - - *TODO: Currently there's a 60 second timeout on PrestoDB queries to prevent long-running queries from piling up and hogging resources. However, if we find that people need to run longer-running queries, we can adjust this. We'll also need to adjust the default [60 second proxy timeouts](https://github.com/NREL/api-umbrella/blob/v0.11.0/config/default.yml#L21-L23).* diff --git a/docs/index.rst b/docs/index.rst index 1706794f6..1cc50e897 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -47,7 +47,6 @@ API Umbrella is an open source API management platform for exposing web service server/listen-ports server/logs server/db-config - server/analytics-storage-adapters .. 
toctree:: :caption: For API Consumers diff --git a/docs/server/analytics-storage-adapters.md b/docs/server/analytics-storage-adapters.md deleted file mode 100644 index a724811d7..000000000 --- a/docs/server/analytics-storage-adapters.md +++ /dev/null @@ -1,37 +0,0 @@ -# Analytics Storage Adapters - -API Umbrella can store analytics data in different types of databases, depending on your performance needs and volume of metrics. The adapter can be picked by setting the `analytics.adapter` option inside the `/etc/api-umbrella/api-umbrella.yml` configuration file. - -## Elasticsearch - -`analytics.adapter: elasticsearch` - -- Currently the default analytics store. -- Suitable for small to medium amounts of historical analytics data. *(TODO: Provide more definitive guidance on what small/medium/large amounts are)* -- API Umbrella ships with with a default ElasticSearch database that can be used or any ElasticSearch 1.7+ cluster can be used. - -## Kylin - -`analytics.adapter: kylin` - -- Suitable for large amounts of historical analytics data. *(TODO: Provide more definitive guidance on what small/medium/large amounts are)* -- Requires a functional Hadoop environment compatible with Kylin 1.2 ([Hortonworks Data Platform (HDP) 2.2](http://hortonworks.com/products/releases/hdp-2-2/) is recommended). -- Requires Kafka (can be enabled as part of HDP). -- Requires the optional `api-umbrella-hadoop-analytics` package to be installed on the analytics database server. *(TODO: Link to hadoop-analytics package downloads once built)* - -## PostgreSQL - -`analytics.adapter: postgresql` - -**TODO: The PostgreSQL adapter doesn't currently exist, but the idea is to leverage the same SQL framework built for Kylin.** - -- Suitable for small amounts of historical analytics data. *(TODO: Provide more definitive guidance on what small/medium/large amounts are)* - -### PostgreSQL: Columnar Storage - -- Suitable for small to medium amounts of historical analytics data. *(TODO: Provide more definitive guidance on what small/medium/large amounts are)* - -For better analytics performance with larger volumes of analytics data, you can continue to use the PostgreSQL adapter, but with a compatible column-based variant: - -- [cstore_fdw](https://github.com/citusdata/cstore_fdw) -- [Amazon Redshift](https://aws.amazon.com/redshift/) diff --git a/scripts/rake/lint.rake b/scripts/rake/lint.rake index d89eeb815..ecf2e055e 100644 --- a/scripts/rake/lint.rake +++ b/scripts/rake/lint.rake @@ -52,7 +52,6 @@ namespace :lint do # Ignore certain vendored files for linting. 
ignore_files = [ "configure", - "templates/etc/perp/kylin/rc.run.mustache", ] ["sh", "bash"].each do |shell| diff --git a/src/api-umbrella/admin-ui/app/controllers/stats/base.js b/src/api-umbrella/admin-ui/app/controllers/stats/base.js index 7aeac1063..746c64911 100644 --- a/src/api-umbrella/admin-ui/app/controllers/stats/base.js +++ b/src/api-umbrella/admin-ui/app/controllers/stats/base.js @@ -19,5 +19,4 @@ export default Controller.extend({ value: null, }], }), - beta_analytics: false, }); diff --git a/src/api-umbrella/admin-ui/app/controllers/stats/drilldown.js b/src/api-umbrella/admin-ui/app/controllers/stats/drilldown.js index 75fb56636..b6d1221ce 100644 --- a/src/api-umbrella/admin-ui/app/controllers/stats/drilldown.js +++ b/src/api-umbrella/admin-ui/app/controllers/stats/drilldown.js @@ -8,6 +8,5 @@ export default Base.extend({ 'query', 'search', 'prefix', - 'beta_analytics', ], }); diff --git a/src/api-umbrella/admin-ui/app/controllers/stats/logs.js b/src/api-umbrella/admin-ui/app/controllers/stats/logs.js index 7b618de61..9fb48b6d2 100644 --- a/src/api-umbrella/admin-ui/app/controllers/stats/logs.js +++ b/src/api-umbrella/admin-ui/app/controllers/stats/logs.js @@ -8,6 +8,5 @@ export default Base.extend({ 'interval', 'query', 'search', - 'beta_analytics', ], }); diff --git a/src/api-umbrella/admin-ui/app/controllers/stats/map.js b/src/api-umbrella/admin-ui/app/controllers/stats/map.js index 44b0aaa62..1a41c0570 100644 --- a/src/api-umbrella/admin-ui/app/controllers/stats/map.js +++ b/src/api-umbrella/admin-ui/app/controllers/stats/map.js @@ -7,6 +7,5 @@ export default Base.extend({ 'query', 'search', 'region', - 'beta_analytics', ], }); diff --git a/src/api-umbrella/admin-ui/app/controllers/stats/users.js b/src/api-umbrella/admin-ui/app/controllers/stats/users.js index 9edc03586..f399b3f39 100644 --- a/src/api-umbrella/admin-ui/app/controllers/stats/users.js +++ b/src/api-umbrella/admin-ui/app/controllers/stats/users.js @@ -6,6 +6,5 @@ export default Base.extend({ 'end_at', 'query', 'search', - 'beta_analytics', ], }); diff --git a/src/api-umbrella/admin-ui/app/routes/stats/drilldown.js b/src/api-umbrella/admin-ui/app/routes/stats/drilldown.js index 8ee8bf81f..d3966f947 100644 --- a/src/api-umbrella/admin-ui/app/routes/stats/drilldown.js +++ b/src/api-umbrella/admin-ui/app/routes/stats/drilldown.js @@ -24,9 +24,6 @@ export default Base.extend({ prefix: { refreshModel: true, }, - beta_analytics: { - refreshModel: true, - }, }, model() { diff --git a/src/api-umbrella/admin-ui/app/routes/stats/logs.js b/src/api-umbrella/admin-ui/app/routes/stats/logs.js index 1d0bc0422..7909deda9 100644 --- a/src/api-umbrella/admin-ui/app/routes/stats/logs.js +++ b/src/api-umbrella/admin-ui/app/routes/stats/logs.js @@ -21,9 +21,6 @@ export default Base.extend({ search: { refreshModel: true, }, - beta_analytics: { - refreshModel: true, - }, }, model() { diff --git a/src/api-umbrella/admin-ui/app/routes/stats/map.js b/src/api-umbrella/admin-ui/app/routes/stats/map.js index 2cbbd920f..6e7e4bb24 100644 --- a/src/api-umbrella/admin-ui/app/routes/stats/map.js +++ b/src/api-umbrella/admin-ui/app/routes/stats/map.js @@ -21,9 +21,6 @@ export default Base.extend({ region: { refreshModel: true, }, - beta_analytics: { - refreshModel: true, - }, }, model() { diff --git a/src/api-umbrella/admin-ui/app/routes/stats/users.js b/src/api-umbrella/admin-ui/app/routes/stats/users.js index e510e3c47..b363f27d7 100644 --- a/src/api-umbrella/admin-ui/app/routes/stats/users.js +++ 
b/src/api-umbrella/admin-ui/app/routes/stats/users.js @@ -17,9 +17,6 @@ export default Base.extend({ search: { refreshModel: true, }, - beta_analytics: { - refreshModel: true, - }, }, model() { diff --git a/src/api-umbrella/admin-ui/app/templates/components/stats/query-form.hbs b/src/api-umbrella/admin-ui/app/templates/components/stats/query-form.hbs index 0faf06903..9176fb6fb 100644 --- a/src/api-umbrella/admin-ui/app/templates/components/stats/query-form.hbs +++ b/src/api-umbrella/admin-ui/app/templates/components/stats/query-form.hbs @@ -4,10 +4,6 @@ {{t "admin.stats.filter_results"}}
- {{#if session.data.authenticated.enable_beta_analytics}} - - {{/if}} - {{#if enableInterval}} {{#bs-button-group value=interval onChange=(action (mut interval)) type="radio" size="xs" id="interval_buttons" as |bg|}} {{#bg.button value="minute"}}{{t "admin.stats.minute"}}{{/bg.button}} diff --git a/src/api-umbrella/admin-ui/app/templates/stats/drilldown.hbs b/src/api-umbrella/admin-ui/app/templates/stats/drilldown.hbs index 342c6874f..a6fe59169 100644 --- a/src/api-umbrella/admin-ui/app/templates/stats/drilldown.hbs +++ b/src/api-umbrella/admin-ui/app/templates/stats/drilldown.hbs @@ -1,4 +1,4 @@ -{{stats/query-form enableInterval=true date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval beta_analytics=beta_analytics allQueryParamValues=allQueryParamValues dateRanges=dateRanges}} +{{stats/query-form enableInterval=true date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval allQueryParamValues=allQueryParamValues dateRanges=dateRanges}} {{stats/drilldown/results-chart hitsOverTime=model.hits_over_time}} {{stats/drilldown/results-breadcrumbs breadcrumbs=model.breadcrumbs}} {{stats/drilldown/results-table results=model.results presentQueryParamValues=presentQueryParamValues backendQueryParamValues=backendQueryParamValues}} diff --git a/src/api-umbrella/admin-ui/app/templates/stats/logs.hbs b/src/api-umbrella/admin-ui/app/templates/stats/logs.hbs index e96aa5926..5c6c791af 100644 --- a/src/api-umbrella/admin-ui/app/templates/stats/logs.hbs +++ b/src/api-umbrella/admin-ui/app/templates/stats/logs.hbs @@ -1,4 +1,4 @@ -{{stats/query-form enableInterval=true date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval beta_analytics=beta_analytics allQueryParamValues=allQueryParamValues dateRanges=dateRanges}} +{{stats/query-form enableInterval=true date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval allQueryParamValues=allQueryParamValues dateRanges=dateRanges}} {{stats/logs/results-chart hitsOverTime=model.hits_over_time}} {{stats/logs/results-highlights stats=model.stats aggregations=model.aggregations presentQueryParamValues=presentQueryParamValues}} {{stats/logs/results-table presentQueryParamValues=presentQueryParamValues backendQueryParamValues=backendQueryParamValues}} diff --git a/src/api-umbrella/admin-ui/app/templates/stats/map.hbs b/src/api-umbrella/admin-ui/app/templates/stats/map.hbs index f34d0dc1c..38255aac3 100644 --- a/src/api-umbrella/admin-ui/app/templates/stats/map.hbs +++ b/src/api-umbrella/admin-ui/app/templates/stats/map.hbs @@ -1,4 +1,4 @@ -{{stats/query-form enableInterval=false date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval beta_analytics=beta_analytics allQueryParamValues=allQueryParamValues dateRanges=dateRanges}} +{{stats/query-form enableInterval=false date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval allQueryParamValues=allQueryParamValues dateRanges=dateRanges}}
{{stats/map/results-breadcrumbs breadcrumbs=model.map_breadcrumbs}} {{stats/map/results-map regions=model.map_regions regionField=model.region_field presentQueryParamValues=presentQueryParamValues allQueryParamValues=allQueryParamValues}} diff --git a/src/api-umbrella/admin-ui/app/templates/stats/users.hbs b/src/api-umbrella/admin-ui/app/templates/stats/users.hbs index 7c5e242ad..8eaf24cfc 100644 --- a/src/api-umbrella/admin-ui/app/templates/stats/users.hbs +++ b/src/api-umbrella/admin-ui/app/templates/stats/users.hbs @@ -1,2 +1,2 @@ -{{stats/query-form enableInterval=false date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval beta_analytics=beta_analytics allQueryParamValues=allQueryParamValues dateRanges=dateRanges}} +{{stats/query-form enableInterval=false date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval allQueryParamValues=allQueryParamValues dateRanges=dateRanges}} {{stats/users/results-table presentQueryParamValues=presentQueryParamValues backendQueryParamValues=backendQueryParamValues}} diff --git a/src/api-umbrella/cli/read_config.lua b/src/api-umbrella/cli/read_config.lua index 2f14856d8..1cf6309e9 100644 --- a/src/api-umbrella/cli/read_config.lua +++ b/src/api-umbrella/cli/read_config.lua @@ -296,12 +296,6 @@ local function set_computed_config() config["analytics"]["outputs"] = { config["analytics"]["adapter"] } end - config["kafka"]["_rsyslog_broker"] = {} - for _, broker in ipairs(config["kafka"]["brokers"]) do - table.insert(config["kafka"]["_rsyslog_broker"], '"' .. broker["host"] .. ":" .. broker["port"] .. '"') - end - config["kafka"]["_rsyslog_broker"] = table.concat(config["kafka"]["_rsyslog_broker"], ",") - -- Setup the request/response timeouts for the different pieces of the stack. -- Since we traverse multiple proxies, we want to make sure the timeouts of -- the different proxies are kept in sync. 
@@ -337,7 +331,6 @@ local function set_computed_config() ["_development_env?"] = (config["app_env"] == "development"), analytics = { ["_output_elasticsearch?"] = array_includes(config["analytics"]["outputs"], "elasticsearch"), - ["_output_kylin?"] = array_includes(config["analytics"]["outputs"], "kylin"), }, mongodb = { _database = plutils.split(array_last(plutils.split(config["mongodb"]["url"], "/", true)), "?", true)[1], @@ -361,7 +354,6 @@ local function set_computed_config() }, ["_service_general_db_enabled?"] = array_includes(config["services"], "general_db"), ["_service_log_db_enabled?"] = array_includes(config["services"], "log_db"), - ["_service_hadoop_db_enabled?"] = array_includes(config["services"], "hadoop_db"), ["_service_router_enabled?"] = array_includes(config["services"], "router"), ["_service_web_enabled?"] = array_includes(config["services"], "web"), ["_service_nginx_reloader_enabled?"] = (array_includes(config["services"], "router") and config["nginx"]["_reloader_frequency"]), diff --git a/src/api-umbrella/hadoop-analytics/.gitignore b/src/api-umbrella/hadoop-analytics/.gitignore deleted file mode 100644 index fc81c16f9..000000000 --- a/src/api-umbrella/hadoop-analytics/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -/.classpath -/.project -/.settings -/bin/ -/dependency-reduced-pom.xml -/target/ diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/.gitignore b/src/api-umbrella/hadoop-analytics/elasticsearch-import/.gitignore deleted file mode 100644 index c0ca0f21c..000000000 --- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -/.classpath -/.project -/.settings -/dependency-reduced-pom.xml -/migrate.log -/target/ diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/README.md b/src/api-umbrella/hadoop-analytics/elasticsearch-import/README.md deleted file mode 100644 index ed3b6ce96..000000000 --- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/README.md +++ /dev/null @@ -1,443 +0,0 @@ -``` -# Import from ElasticSearch to HDFS ORC files. -$ cd hadoop-analytics -$ mvn clean package -$ sudo -u hive java -Dapiumbrella.page_size=10000 -Dapiumbrella.elasticsearch_url="http://ELASTICSEARCH_HOST:9200" -Dapiumbrella.hdfs_uri="hdfs://HDFS_HOST:8020" -Dapiumbrella.timezone=TIMEZONE -jar elasticsearch-import/target/elasticsearch-import-0.0.1-SNAPSHOT.jar - -# Create the Hive table. 
-$ sudo -u hive hive -hive> CREATE DATABASE api_umbrella; -hive> CREATE EXTERNAL TABLE api_umbrella.logs(timestamp_utc BIGINT, id STRING, timestamp_tz_offset INT, timestamp_tz_hour STRING, timestamp_tz_minute STRING, user_id STRING, denied_reason STRING, request_method STRING, request_url_scheme STRING, request_url_host STRING, request_url_port INT, request_url_path STRING, request_url_path_level1 STRING, request_url_path_level2 STRING, request_url_path_level3 STRING, request_url_path_level4 STRING, request_url_path_level5 STRING, request_url_path_level6 STRING, request_url_query STRING, request_ip STRING, request_ip_country STRING, request_ip_region STRING, request_ip_city STRING, request_ip_lat DOUBLE, request_ip_lon DOUBLE, request_user_agent STRING, request_user_agent_type STRING, request_user_agent_family STRING, request_size INT, request_accept STRING, request_accept_encoding STRING, request_content_type STRING, request_connection STRING, request_origin STRING, request_referer STRING, request_basic_auth_username STRING, response_status SMALLINT, response_content_type STRING, response_content_length INT, response_content_encoding STRING, response_transfer_encoding STRING, response_server STRING, response_cache STRING, response_age INT, response_size INT, timer_response DOUBLE, timer_backend_response DOUBLE, timer_internal DOUBLE, timer_proxy_overhead DOUBLE, log_imported BOOLEAN) PARTITIONED BY (timestamp_tz_year DATE, timestamp_tz_month DATE, timestamp_tz_week DATE, timestamp_tz_date DATE) STORED AS ORC LOCATION '/apps/api-umbrella/logs'; -hive> exit; - -# Create the Kylin project and define the table data source. -$ curl 'http://ADMIN:KYLIN@localhost:7070/kylin/api/projects' -H 'Content-Type: application/json;charset=UTF-8' --data-binary '{"name":"api_umbrella","description":""}' -$ curl 'http://ADMIN:KYLIN@localhost:7070/kylin/api/tables/api_umbrella.logs/api_umbrella' -H 'Content-Type: application/json;charset=UTF-8' --data-binary '{}' - -# WAIT... Wait for the Kylin cardinality job to complete before proceeding -# (progress is viewable in the kylin.log file). If you define the table source -# after importing all the partitions, then the cardinality job will take forever -# in Kylin. Since we don't actaully need the cardinality caclculations, we'll -# just ensure they run against an empty table. -# -# If this issue gets addressed, then we would have more control over the -# cardinality job, and wouldn't have to wait: -# https://issues.apache.org/jira/browse/KYLIN-1407 - -# Add all the daily partitions to the Hive table. -$ sudo -u hive hadoop fs -ls -R /apps/api-umbrella/logs | grep -E "(\.orc|000000_0)$" | grep -o "[^ ]*$" | sort -V | sed -e "s/.*timestamp_tz_year=\([0-9\-]\+\).*timestamp_tz_month=\([0-9\-]\+\).*timestamp_tz_week=\([0-9\-]\+\).*timestamp_tz_date=\([0-9\-]\+\).*/ALTER TABLE api_umbrella.logs ADD IF NOT EXISTS PARTITION(timestamp_tz_year='\1', timestamp_tz_month='\2', timestamp_tz_week='\3', timestamp_tz_date='\4');/" | uniq > /tmp/api_umbrella_load_partitions.sql -$ sudo -u hive hive -f /tmp/api_umbrella_load_partitions.sql - -# Create the model. 
-$ echo '{"modelDescData":"{ - \"name\": \"logs_model\", - \"description\": \"\", - \"fact_table\": \"API_UMBRELLA.LOGS\", - \"lookups\": [], - \"filter_condition\": \"\", - \"capacity\": \"MEDIUM\", - \"dimensions\": [ - { - \"table\": \"API_UMBRELLA.LOGS\", - \"columns\": [ - \"TIMESTAMP_TZ_YEAR\", - \"TIMESTAMP_TZ_MONTH\", - \"TIMESTAMP_TZ_WEEK\", - \"TIMESTAMP_TZ_DATE\", - \"REQUEST_URL_HOST\", - \"REQUEST_URL_PATH_LEVEL1\", - \"REQUEST_URL_PATH_LEVEL2\", - \"REQUEST_URL_PATH_LEVEL3\", - \"REQUEST_URL_PATH_LEVEL4\", - \"REQUEST_URL_PATH_LEVEL5\", - \"REQUEST_URL_PATH_LEVEL6\", - \"USER_ID\", - \"REQUEST_IP\", - \"RESPONSE_STATUS\", - \"DENIED_REASON\", - \"REQUEST_METHOD\", - \"REQUEST_IP_COUNTRY\", - \"REQUEST_IP_REGION\", - \"REQUEST_IP_CITY\" - ] - } - ], - \"metrics\": [ - \"USER_ID\", - \"REQUEST_IP\", - \"TIMER_RESPONSE\", - \"TIMESTAMP_UTC\" - ], - \"partition_desc\": { - \"partition_date_column\": \"API_UMBRELLA.LOGS.TIMESTAMP_TZ_DATE\", - \"partition_date_format\": \"yyyy-MM-dd\", - \"partition_date_start\": null, - \"partition_type\": \"APPEND\" - }, - \"last_modified\": 0 -}","project":"api_umbrella"}' | perl -p -e 's/\n/\\n/' | curl -v -XPOST -H "Content-Type: application/json;charset=UTF-8" --data-binary @- "http://ADMIN:KYLIN@localhost:7070/kylin/api/models" - -# Create the cube. -$ echo '{"cubeDescData":"{ - \"name\": \"logs_cube\", - \"model_name\": \"logs_model\", - \"description\": \"\", - \"dimensions\": [ - { - \"name\": \"API_UMBRELLA.LOGS.USER_ID\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"USER_ID\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.DENIED_REASON\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"DENIED_REASON\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_METHOD\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_METHOD\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_HOST\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_URL_HOST\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL1\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_URL_PATH_LEVEL1\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL2\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_URL_PATH_LEVEL2\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL3\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_URL_PATH_LEVEL3\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL4\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_URL_PATH_LEVEL4\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL5\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_URL_PATH_LEVEL5\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL6\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_URL_PATH_LEVEL6\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_IP\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_IP\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_IP_COUNTRY\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_IP_COUNTRY\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_IP_REGION\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_IP_REGION\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.REQUEST_IP_CITY\", - 
\"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"REQUEST_IP_CITY\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.RESPONSE_STATUS\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"RESPONSE_STATUS\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.TIMESTAMP_TZ_YEAR\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"TIMESTAMP_TZ_YEAR\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.TIMESTAMP_TZ_MONTH\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"TIMESTAMP_TZ_MONTH\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.TIMESTAMP_TZ_WEEK\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"TIMESTAMP_TZ_WEEK\", - \"derived\": null - }, - { - \"name\": \"API_UMBRELLA.LOGS.TIMESTAMP_TZ_DATE\", - \"table\": \"API_UMBRELLA.LOGS\", - \"column\": \"TIMESTAMP_TZ_DATE\", - \"derived\": null - } - ], - \"measures\": [ - { - \"name\": \"_COUNT_\", - \"function\": { - \"expression\": \"COUNT\", - \"parameter\": { - \"type\": \"constant\", - \"value\": \"1\", - \"next_parameter\": null - }, - \"returntype\": \"bigint\" - }, - \"dependent_measure_ref\": null - }, - { - \"name\": \"COUNT_DISTINCT_USER_ID\", - \"function\": { - \"expression\": \"COUNT_DISTINCT\", - \"parameter\": { - \"type\": \"column\", - \"value\": \"USER_ID\", - \"next_parameter\": null - }, - \"returntype\": \"hllc12\" - }, - \"dependent_measure_ref\": null - }, - { - \"name\": \"COUNT_DISTINCT_REQUEST_IP\", - \"function\": { - \"expression\": \"COUNT_DISTINCT\", - \"parameter\": { - \"type\": \"column\", - \"value\": \"REQUEST_IP\", - \"next_parameter\": null - }, - \"returntype\": \"hllc12\" - }, - \"dependent_measure_ref\": null - }, - { - \"name\": \"SUM_TIMER_RESPONSE\", - \"function\": { - \"expression\": \"SUM\", - \"parameter\": { - \"type\": \"column\", - \"value\": \"TIMER_RESPONSE\", - \"next_parameter\": null - }, - \"returntype\": \"decimal\" - }, - \"dependent_measure_ref\": null - }, - { - \"name\": \"MAX_TIMESTAMP_UTC\", - \"function\": { - \"expression\": \"MAX\", - \"parameter\": { - \"type\": \"column\", - \"value\": \"TIMESTAMP_UTC\", - \"next_parameter\": null - }, - \"returntype\": \"bigint\" - }, - \"dependent_measure_ref\": null - } - ], - \"rowkey\": { - \"rowkey_columns\": [ - { - \"column\": \"TIMESTAMP_TZ_YEAR\", - \"encoding\": \"dict\" - }, - { - \"column\": \"TIMESTAMP_TZ_MONTH\", - \"encoding\": \"dict\" - }, - { - \"column\": \"TIMESTAMP_TZ_WEEK\", - \"encoding\": \"dict\" - }, - { - \"column\": \"TIMESTAMP_TZ_DATE\", - \"encoding\": \"dict\" - }, - { - \"column\": \"REQUEST_URL_HOST\", - \"encoding\": \"dict\" - }, - { - \"column\": \"REQUEST_URL_PATH_LEVEL1\", - \"encoding\": \"fixed_length:40\" - }, - { - \"column\": \"DENIED_REASON\", - \"encoding\": \"dict\" - }, - { - \"column\": \"USER_ID\", - \"encoding\": \"fixed_length:36\" - }, - { - \"column\": \"REQUEST_IP\", - \"encoding\": \"fixed_length:45\" - }, - { - \"column\": \"REQUEST_URL_PATH_LEVEL2\", - \"encoding\": \"fixed_length:40\" - }, - { - \"column\": \"REQUEST_URL_PATH_LEVEL3\", - \"encoding\": \"fixed_length:40\" - }, - { - \"column\": \"REQUEST_URL_PATH_LEVEL4\", - \"encoding\": \"fixed_length:40\" - }, - { - \"column\": \"REQUEST_URL_PATH_LEVEL5\", - \"encoding\": \"fixed_length:40\" - }, - { - \"column\": \"REQUEST_URL_PATH_LEVEL6\", - \"encoding\": \"fixed_length:40\" - }, - { - \"column\": \"REQUEST_IP_COUNTRY\", - \"encoding\": \"dict\" - }, - { - \"column\": \"REQUEST_IP_REGION\", - \"encoding\": \"dict\" - }, - { - \"column\": \"REQUEST_IP_CITY\", - 
\"encoding\": \"dict\" - }, - { - \"column\": \"RESPONSE_STATUS\", - \"encoding\": \"dict\" - }, - { - \"column\": \"REQUEST_METHOD\", - \"encoding\": \"dict\" - } - ] - }, - \"hbase_mapping\": { - \"column_family\": [ - { - \"name\": \"f1\", - \"columns\": [ - { - \"qualifier\": \"m\", - \"measure_refs\": [ - \"_COUNT_\", - \"SUM_TIMER_RESPONSE\", - \"MAX_TIMESTAMP_UTC\" - ] - } - ] - }, - { - \"name\": \"f2\", - \"columns\": [ - { - \"qualifier\": \"m\", - \"measure_refs\": [ - \"COUNT_DISTINCT_USER_ID\", - \"COUNT_DISTINCT_REQUEST_IP\" - ] - } - ] - } - ] - }, - \"aggregation_groups\": [ - { - \"includes\": [ - \"TIMESTAMP_TZ_YEAR\", - \"TIMESTAMP_TZ_MONTH\", - \"TIMESTAMP_TZ_WEEK\", - \"TIMESTAMP_TZ_DATE\", - \"REQUEST_URL_HOST\", - \"REQUEST_URL_PATH_LEVEL1\", - \"REQUEST_URL_PATH_LEVEL2\", - \"REQUEST_URL_PATH_LEVEL3\", - \"REQUEST_URL_PATH_LEVEL4\", - \"REQUEST_URL_PATH_LEVEL5\", - \"REQUEST_URL_PATH_LEVEL6\", - \"USER_ID\", - \"REQUEST_IP\", - \"RESPONSE_STATUS\", - \"DENIED_REASON\", - \"REQUEST_METHOD\", - \"REQUEST_IP_COUNTRY\", - \"REQUEST_IP_REGION\", - \"REQUEST_IP_CITY\" - ], - \"select_rule\": { - \"hierarchy_dims\": [ - [ - \"REQUEST_URL_HOST\", - \"REQUEST_URL_PATH_LEVEL1\", - \"REQUEST_URL_PATH_LEVEL2\", - \"REQUEST_URL_PATH_LEVEL3\", - \"REQUEST_URL_PATH_LEVEL4\", - \"REQUEST_URL_PATH_LEVEL5\", - \"REQUEST_URL_PATH_LEVEL6\" - ], - [ - \"REQUEST_IP_COUNTRY\", - \"REQUEST_IP_REGION\", - \"REQUEST_IP_CITY\" - ] - ], - \"mandatory_dims\": [ - \"TIMESTAMP_TZ_YEAR\", - \"TIMESTAMP_TZ_MONTH\", - \"TIMESTAMP_TZ_WEEK\", - \"TIMESTAMP_TZ_DATE\" - ], - \"joint_dims\": [ - [ - \"USER_ID\", - \"REQUEST_IP\" - ] - ] - } - } - ], - \"notify_list\": [], - \"status_need_notify\": [], - \"partition_date_start\": 1281916800000, - \"auto_merge_time_ranges\": [ - 604800000, - 2419200000 - ], - \"retention_range\": 0, - \"engine_type\": 2, - \"storage_type\": 2 -}","cubeName":"logs_cube","project":"api_umbrella","streamingCube":false}' | perl -p -e 's/\n/\\n/' | curl -v -XPOST -H "Content-Type: application/json;charset=UTF-8" --data-binary @- "http://ADMIN:KYLIN@localhost:7070/kylin/api/cubes" -``` diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/logging.properties b/src/api-umbrella/hadoop-analytics/elasticsearch-import/logging.properties deleted file mode 100644 index d8c1d20e4..000000000 --- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/logging.properties +++ /dev/null @@ -1,2 +0,0 @@ -org.apache.parquet.handlers=java.util.logging.ConsoleHandler -java.util.logging.ConsoleHandler.level=SEVERE diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/pom.xml b/src/api-umbrella/hadoop-analytics/elasticsearch-import/pom.xml deleted file mode 100644 index 65221aecd..000000000 --- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/pom.xml +++ /dev/null @@ -1,114 +0,0 @@ - - 4.0.0 - - - api-umbrella.hadoop-analytics - hadoop-analytics - 0.0.1-SNAPSHOT - - elasticsearch-import - jar - - elasticsearch-import - http://maven.apache.org - - - UTF-8 - 1.7 - 1.7 - - - - - - api-umbrella.hadoop-analytics - schema - 0.0.1-SNAPSHOT - - - - io.searchbox - jest - 2.0.0 - - - - - joda-time - joda-time - 2.9.2 - - - - org.slf4j - slf4j-simple - 1.7.16 - - - - org.apache.hive - hive-exec - 0.14.0 - - - - org.apache.hadoop - hadoop-client - 2.6.0 - - - - org.apache.calcite - calcite-core - 0.9.2-incubating - - - org.apache.calcite - calcite-avatica - 0.9.2-incubating - - - - ${buildDir}/elasticsearch-import - - - - org.apache.maven.plugins - maven-shade-plugin - 
2.4.3 - - - package - - shade - - - - - apiumbrella.hadoop_analytics.App - - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - - - - diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/App.java b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/App.java deleted file mode 100644 index acacf1d51..000000000 --- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/App.java +++ /dev/null @@ -1,145 +0,0 @@ -package apiumbrella.hadoop_analytics; - -import java.io.IOException; -import java.math.BigInteger; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Map.Entry; -import java.util.Set; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.Period; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.gson.JsonElement; - -import io.searchbox.client.JestClient; -import io.searchbox.client.JestClientFactory; -import io.searchbox.client.JestResult; -import io.searchbox.client.config.HttpClientConfig; -import io.searchbox.indices.aliases.GetAliases; - -public class App { - protected static final String HDFS_URI = - System.getProperty("apiumbrella.hdfs_uri", "hdfs://127.0.0.1:8020"); - protected static String DIR = System.getProperty("apiumbrella.dir", "/apps/api-umbrella/logs"); - protected static String ELASTICSEARCH_URL = - System.getProperty("apiumbrella.elasticsearch_url", "http://localhost:9200"); - protected static int PAGE_SIZE = - Integer.parseInt(System.getProperty("apiumbrella.page_size", "5000")); - protected static int CONCURRENCY = - Integer.parseInt(System.getProperty("apiumbrella.concurrency", "4")); - protected static String START_DATE = System.getProperty("apiumbrella.start_date"); - protected static String END_DATE = System.getProperty("apiumbrella.end_date"); - protected static DateTimeZone TIMEZONE = - DateTimeZone.forID(System.getProperty("apiumbrella.timezone", "UTC")); - - // Define fields we won't migrate to the new database. - protected static final Set SKIP_FIELDS = new HashSet(Arrays.asList(new String[] { - // These URL-related fields are handled specially when dealing with the - // request_url field. - "request_scheme", "request_host", "request_path", "request_query", - - // We're no longer storing the special hierarchy field. - "request_hierarchy", "request_path_hierarchy", - - // We're only storing the user_id, and not other fields that can be derived - // from it (lookups on these fields will need to first query the user table, - // and then perform queries base don user_id). - "user_registration_source", "user_email", "api_key", - - // Old timer field from the nodejs stack, that's not relevant anymore, and - // we don't need to migrate over (it was always somewhat duplicative). - "internal_response_time", - - // Junk field we've seen on some old data. 
- "_type"})); - - final Logger logger = LoggerFactory.getLogger(App.class); - private BigInteger globalHits = BigInteger.valueOf(0); - - public App() { - System.out.println("Logging to log/elasticsearch-import.log..."); - - ExecutorService executor = Executors.newFixedThreadPool(CONCURRENCY); - - DateTime date = this.getStartDate(); - DateTime endDate = this.getEndDate(); - while (date.isBefore(endDate)) { - Runnable worker = new DayWorker(this, date); - executor.execute(worker); - - date = date.plus(Period.days(1)); - } - - executor.shutdown(); - while (!executor.isTerminated()) { - } - logger.info("Finished all threads"); - } - - protected synchronized BigInteger incrementGlobalHits(Integer total) { - this.globalHits = this.globalHits.add(BigInteger.valueOf(total)); - return this.globalHits; - } - - private DateTime getStartDate() { - if (START_DATE != null) { - DateTimeFormatter dateParser = ISODateTimeFormat.dateParser().withZone(TIMEZONE); - return dateParser.parseDateTime(START_DATE); - } - - JestClientFactory factory = new JestClientFactory(); - factory.setHttpClientConfig( - new HttpClientConfig.Builder(ELASTICSEARCH_URL).multiThreaded(true).build()); - GetAliases aliases = new GetAliases.Builder().build(); - JestClient client = factory.getObject(); - DateTime first = null; - try { - JestResult result = client.execute(aliases); - if (!result.isSucceeded()) { - logger.error(result.getErrorMessage()); - System.exit(1); - } - - for (Entry entry : result.getJsonObject().entrySet()) { - Pattern pattern = Pattern.compile("^api-umbrella.*-([0-9]{4})-([0-9]{2})$"); - Matcher matches = pattern.matcher(entry.getKey()); - if (matches.find()) { - DateTime indexDate = - new DateTime(Integer.parseInt(matches.group(1)), Integer.parseInt(matches.group(2)), - 1, 0, 0, 0, DateTimeZone.UTC).withZone(TIMEZONE).withTime(0, 0, 0, 0); - if (first == null || indexDate.isBefore(first)) { - first = indexDate; - } - } - } - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } - - return first; - } - - private DateTime getEndDate() { - if (END_DATE != null) { - DateTimeFormatter dateParser = ISODateTimeFormat.dateParser().withZone(TIMEZONE); - return dateParser.parseDateTime(END_DATE); - } - - return new DateTime(); - } - - public static void main(String[] args) throws SecurityException, IOException { - new App(); - } -} diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/DayWorker.java b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/DayWorker.java deleted file mode 100644 index 702705595..000000000 --- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/DayWorker.java +++ /dev/null @@ -1,597 +0,0 @@ -package apiumbrella.hadoop_analytics; - -import java.io.IOException; -import java.math.BigInteger; -import java.net.MalformedURLException; -import java.net.URL; -import java.net.URLEncoder; -import java.nio.file.Paths; -import java.text.NumberFormat; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.Locale; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.commons.lang.StringUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.orc.CompressionKind; -import 
org.apache.hadoop.hive.ql.io.orc.OrcFile; -import org.apache.hadoop.hive.ql.io.orc.OrcFile.WriterOptions; -import org.apache.hadoop.hive.ql.io.orc.Writer; -import org.apache.hadoop.hive.serde2.io.DoubleWritable; -import org.apache.hadoop.hive.serde2.io.ShortWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.io.BooleanWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.Period; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import com.google.gson.JsonPrimitive; - -import io.searchbox.client.JestClient; -import io.searchbox.client.JestClientFactory; -import io.searchbox.client.JestResult; -import io.searchbox.client.config.HttpClientConfig; -import io.searchbox.core.Search; -import io.searchbox.core.SearchScroll; -import io.searchbox.params.Parameters; - -public class DayWorker implements Runnable { - final Logger logger = LoggerFactory.getLogger(DayWorker.class); - - private DateTime dayStartTime; - private DateTime dayEndTime; - private static LogSchema schema; - private App app; - private int totalProcessedHits = 0; - private int totalHits; - private static WriterOptions orcWriterOptions; - private Writer orcWriter; - DateTimeFormatter dateTimeParser = ISODateTimeFormat.dateTimeParser(); - DateTimeFormatter dateTimeFormatter = - DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(App.TIMEZONE); - DateTimeFormatter dateFormatter = ISODateTimeFormat.date().withZone(App.TIMEZONE); - - public DayWorker(App app, DateTime date) { - this.app = app; - dayStartTime = date; - dayEndTime = this.dayStartTime.plus(Period.days(1)); - schema = new LogSchema(); - } - - public void run() { - try { - JestClientFactory factory = new JestClientFactory(); - factory.setHttpClientConfig(new HttpClientConfig.Builder(App.ELASTICSEARCH_URL) - .multiThreaded(true).connTimeout(60000).readTimeout(120000).build()); - JestClient client = factory.getObject(); - - // Perform a scroll query to fetch the specified day's data from - // elasticsearch. - String query = "{" + // - " \"sort\":\"request_at\"," + // - " \"query\":{" + // - " \"filtered\":{" + // - " \"filter\":{" + // - " \"range\":{" + // - " \"request_at\":{" + // - " \"gte\":" + this.dayStartTime.getMillis() + "," + // - " \"lt\":" + this.dayEndTime.getMillis() + // - " }" + // - " }" + // - " }" + // - " }" + // - " }" + // - "}"; - // Query the indexes that cover this day (this may involve multiple indexes on month - // boundaries, if we're converting into a timezone, since the indexes are UTC-based). 
- String startTimeIndex = "api-umbrella-logs-" - + ISODateTimeFormat.yearMonth().withZone(DateTimeZone.UTC).print(dayStartTime); - String endTimeIndex = "api-umbrella-logs-" - + ISODateTimeFormat.yearMonth().withZone(DateTimeZone.UTC).print(dayEndTime); - Search search = new Search.Builder(query).ignoreUnavailable(true).addIndex(startTimeIndex) - .addIndex(endTimeIndex).setParameter(Parameters.SIZE, App.PAGE_SIZE) - .setParameter(Parameters.SCROLL, "3m").build(); - - JestResult result = client.execute(search); - if (!result.isSucceeded()) { - logger.error(result.getErrorMessage()); - System.exit(1); - } - - String scrollId = result.getJsonObject().get("_scroll_id").getAsString(); - this.totalHits = result.getJsonObject().getAsJsonObject("hits").get("total").getAsInt(); - - while (true) { - // Keep looping until the scroll result returns no results. - if (!processResult(result)) { - break; - } - - SearchScroll scroll = new SearchScroll.Builder(scrollId, "3m").build(); - result = client.execute(scroll); - if (!result.isSucceeded()) { - logger.error(result.getErrorMessage()); - System.exit(1); - } - - scrollId = result.getJsonObject().get("_scroll_id").getAsString(); - } - - // Close the data file (but only if it exists, so we skip over days - // with no data). - if (this.orcWriter != null) { - this.orcWriter.close(); - this.orcWriter = null; - } - } catch (Exception e) { - logger.error("Unexpected error", e); - System.exit(1); - } - } - - private static WriterOptions getOrcWriterOptions() throws IOException { - if (orcWriterOptions == null) { - ArrayList orcFields = new ArrayList(); - for (int i = 0; i < schema.getNonPartitionFieldsList().size(); i++) { - String field = schema.getNonPartitionFieldsList().get(i); - Schema.Type type = schema.getFieldType(field); - - ObjectInspector inspector; - if (type == Schema.Type.INT) { - if (schema.isFieldTypeShort(field)) { - inspector = PrimitiveObjectInspectorFactory.writableShortObjectInspector; - } else { - inspector = PrimitiveObjectInspectorFactory.writableIntObjectInspector; - } - } else if (type == Schema.Type.LONG) { - inspector = PrimitiveObjectInspectorFactory.writableLongObjectInspector; - } else if (type == Schema.Type.DOUBLE) { - inspector = PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; - } else if (type == Schema.Type.BOOLEAN) { - inspector = PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; - } else if (type == Schema.Type.STRING) { - inspector = PrimitiveObjectInspectorFactory.writableStringObjectInspector; - } else { - throw new IOException("Unknown type: " + type.toString()); - } - - orcFields.add(new OrcField(field, inspector, i)); - } - - Configuration conf = new Configuration(); - // Fix for hadoop jar ordering: http://stackoverflow.com/a/21118824 - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - - orcWriterOptions = OrcFile.writerOptions(conf); - orcWriterOptions.compress(CompressionKind.ZLIB); - orcWriterOptions.inspector(new OrcRowInspector(orcFields)); - } - - return orcWriterOptions; - } - - private Writer getOrcWriter() throws IOException { - if (this.orcWriter == null) { - String date = dateFormatter.print(dayStartTime); - // Create a new file in /dir/YYYY/MM/WW/YYYY-MM-DD.par - Path path = new Path(App.HDFS_URI + Paths.get(App.DIR, - "timestamp_tz_year=" + dateFormatter.print(dayStartTime.withDayOfYear(1)), - "timestamp_tz_month=" + 
dateFormatter.print(dayStartTime.withDayOfMonth(1)), - "timestamp_tz_week=" + dateFormatter.print(dayStartTime.withDayOfWeek(1)), - "timestamp_tz_date=" + date, date + ".orc")); - this.orcWriter = OrcFile.createWriter(path, getOrcWriterOptions()); - } - - return this.orcWriter; - } - - private boolean processResult(JestResult result) throws Exception { - JsonArray hits = result.getJsonObject().getAsJsonObject("hits").getAsJsonArray("hits"); - - int pageHits = hits.size(); - if (pageHits == 0) { - return false; - } - - this.totalProcessedHits += pageHits; - - BigInteger globalHits = this.app.incrementGlobalHits(pageHits); - NumberFormat numberFormatter = NumberFormat.getNumberInstance(Locale.US); - DateTime firstRequestAt = this.parseTimestamp( - hits.get(0).getAsJsonObject().get("_source").getAsJsonObject().get("request_at")); - logger.info(String.format("Processing %s to %s | %10s / %10s | %12s | %s", this.dayStartTime, - this.dayEndTime, numberFormatter.format(this.totalProcessedHits), - numberFormatter.format(this.totalHits), numberFormatter.format(globalHits), - firstRequestAt)); - - for (int i = 0; i < pageHits; i++) { - JsonObject hit = hits.get(i).getAsJsonObject(); - this.processHit(hit); - } - - return true; - } - - private void processHit(JsonObject hit) throws Exception { - JsonObject source = hit.get("_source").getAsJsonObject(); - - try { - // For each hit, create a new Avro record to serialize it into the new - // format for storage. - GenericRecord log = new GenericData.Record(schema.getSchema()); - log.put("id", hit.get("_id").getAsString()); - - // Loop over each attribute in the source data, assigning each value to - // the new data record. - for (Map.Entry entry : source.entrySet()) { - String key = entry.getKey(); - - // Skip this field if we've explicitly marked it as not migrating. - if (App.SKIP_FIELDS.contains(key)) { - continue; - } - - JsonElement value = entry.getValue(); - - // Skip setting anything if the value is null. - if (value == null || value.isJsonNull()) { - continue; - } - - // Handle special processing for certain fields. - switch (key) { - case "request_at": - // Split up the timestamp into several fields for better compatibility - // with the Kylin's cube's that will be created (which doesn't support - // timestamps yet). - DateTime requestAt = this.parseTimestamp(value); - log.put("timestamp_utc", requestAt.getMillis()); - log.put("timestamp_tz_offset", App.TIMEZONE.getOffset(requestAt.getMillis())); - log.put("timestamp_tz_year", dateFormatter.print(requestAt.withDayOfYear(1))); - log.put("timestamp_tz_month", dateFormatter.print(requestAt.withDayOfMonth(1))); - log.put("timestamp_tz_week", dateFormatter.print(requestAt.withDayOfWeek(1))); - log.put("timestamp_tz_date", dateFormatter.print(requestAt)); - log.put("timestamp_tz_hour", dateTimeFormatter - .print(requestAt.withMinuteOfHour(0).withSecondOfMinute(0).withMillisOfSecond(0))); - log.put("timestamp_tz_minute", - dateTimeFormatter.print(requestAt.withSecondOfMinute(0).withMillisOfSecond(0))); - value = null; - break; - case "request_ip_location": - // Flatten the location object into two separate fields. - log.put("request_ip_lat", value.getAsJsonObject().get("lat").getAsDouble()); - log.put("request_ip_lon", value.getAsJsonObject().get("lon").getAsDouble()); - value = null; - break; - case "request_url": - // Perform various cleanup and sanity checks on storing the URL as - // separate fields (versus the duplicative separate fields plus a full - // URL field). 
The full URL field sometimes differs in the data versus - // the individual fields, so we want to make sure we're transferring - // the best data possible and not losing anything in the process. - URL url; - try { - url = new URL(value.getAsString()); - } catch (MalformedURLException e) { - try { - // Cleanup some oddities in some invalid URLs seen (I think from - // localhost testing). - url = new URL(value.getAsString().replace(":80:80/", ":80/").replace("://[", "://") - .replace("]/", "/")); - } catch (MalformedURLException e2) { - logger.error(hit.toString()); - throw (e2); - } - } - - // Store the original request_scheme, since that seems to be more - // accurate than sometimes incorrect http:// urls on request_url that - // are actually https. - String requestScheme = source.get("request_scheme").getAsString(); - if (!url.getProtocol().equals(requestScheme)) { - logger.warn("request_url's scheme (" + url.getProtocol() - + ") does not match request_scheme (" + requestScheme + ")"); - } - log.put("request_url_scheme", requestScheme); - - // Store the host extracted from the full URL, since that seems more - // accurate than the separate request_host field (it seems to better - // handle some odd invalid hostnames, which probably don't actually - // matter too much). - String requestHost = source.get("request_host").getAsString().toLowerCase(); - String urlHost = url.getHost().toLowerCase(); - if (!urlHost.equals(requestHost)) { - logger.warn("request_url's host (" + url.getHost() + ") does not match request_host (" - + requestHost + ")"); - } - log.put("request_url_host", urlHost); - - // As a new field, store the port used. Most of the time this will be - // the default 80 or 443, depending on the scheme. - int urlPort = url.getPort(); - if (log.get("request_url_scheme").equals("https") && urlPort == 80) { - log.put("request_url_port", 443); - } else { - // If the port isn't set, or it's 50090, set it to the default port - // based on the scheme. We're ignoring port 50090, since this is - // present on some of our rather old imported logs, and was an - // internal-only port that was used (but was never public, so this - // isn't accurate). - if (urlPort == -1 || urlPort == 50090) { - if (log.get("request_url_scheme").equals("https")) { - log.put("request_url_port", 443); - } else { - log.put("request_url_port", 80); - } - } else { - log.put("request_url_port", urlPort); - } - } - - // Store the path extracted from the full URL, since it seems to be - // more accurate at dealing with odd URL encoding issues. - String requestPath = source.get("request_path").getAsString(); - if (!url.getPath().equals(requestPath)) { - // Before throwing a warning, ignore some semi-common URL encoding - // differences between the full URL and the request_path field - // (where we're comfortable with the encoding of the full URL's - // version). Also deal with missing hash fragment details. 
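To make the URL decomposition above more concrete, the removed code splits request_url into scheme, host, port, path, and query fields via java.net.URL, falling back to the scheme's default port when none is present. A simplified sketch (it omits the request_scheme/request_host cross-checks and the legacy 50090 special case; the example URL is hypothetical):

```
import java.net.MalformedURLException;
import java.net.URL;

public class UrlDecomposeExample {
  public static void main(String[] args) throws MalformedURLException {
    // Hypothetical input URL for illustration.
    URL url = new URL("https://Example.com/foo/bar?baz=1");

    String scheme = url.getProtocol();          // "https"
    String host = url.getHost().toLowerCase();  // "example.com"

    // getPort() returns -1 when no explicit port is given; default it by scheme,
    // mirroring the normalization described above.
    int port = url.getPort();
    if (port == -1) {
      port = scheme.equals("https") ? 443 : 80;  // 443 here
    }

    String path = url.getPath();    // "/foo/bar"
    String query = url.getQuery();  // "baz=1"

    System.out.println(scheme + " " + host + ":" + port + path + "?" + query);
  }
}
```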
- String encodedUrlPath = url.getPath(); - if (url.getRef() != null && url.getQuery() == null) { - encodedUrlPath += "#" + url.getRef(); - } - encodedUrlPath = URLEncoder.encode(encodedUrlPath, "UTF-8"); - encodedUrlPath = encodedUrlPath.replace("%25", "%"); - - String encodedRequestPath = requestPath.replaceAll("/(x[0-9])", "\\\\$1"); - encodedRequestPath = URLEncoder.encode(encodedRequestPath, "UTF-8"); - encodedRequestPath = encodedRequestPath.replace("%25", "%"); - - if (!encodedUrlPath.equals(encodedRequestPath)) { - logger.warn("request_url's path (" + url.getPath() + " - " + encodedUrlPath - + ") does not match request_path (" + requestPath + " - " + encodedRequestPath - + ")"); - } - } - log.put("request_url_path", url.getPath()); - - String[] pathLevels = StringUtils.split(url.getPath(), "/", 6); - for (int i = 0; i < pathLevels.length; i++) { - String pathLevel = pathLevels[i]; - if (i < pathLevels.length - 1) { - pathLevel = pathLevel + "/"; - } - if (i == 0 && pathLevel != "/") { - pathLevel = "/" + pathLevel; - } - log.put("request_url_path_level" + (i + 1), pathLevel); - } - - // Store the query string extracted from the full URL. - String requestQuery = url.getQuery(); - log.put("request_url_query", requestQuery); - - // If a hash fragment is present in the full URL, this is actually a - // flag that something's fishy with the URL encoding, since our - // server-side logs can't possible contain fragment information. So - // we'll assume this information actually represents something - // following a URL-encoded hash fragment, and append that to the - // appropriate place. - String urlRef = url.getRef(); - if (urlRef != null) { - if (log.get("request_url_query") != null) { - log.put("request_url_query", log.get("request_url_query") + "%23" + urlRef); - } else { - log.put("request_url_path", log.get("request_url_path") + "%23" + urlRef); - } - } - - // Re-assemble the URL based on all of our newly stored individual - // componetns. - String reassmbledUrl = - log.get("request_url_scheme") + "://" + log.get("request_url_host"); - if (log.get("request_url_scheme").equals("http")) { - if ((Integer) log.get("request_url_port") != 80) { - reassmbledUrl = reassmbledUrl + ":" + log.get("request_url_port"); - } - } else if (log.get("request_url_scheme").equals("https")) { - if ((Integer) log.get("request_url_port") != 443) { - reassmbledUrl = reassmbledUrl + ":" + log.get("request_url_port"); - } - } else { - reassmbledUrl = reassmbledUrl + ":" + log.get("request_url_port"); - } - reassmbledUrl = reassmbledUrl + log.get("request_url_path"); - if (requestQuery != null) { - reassmbledUrl = reassmbledUrl + "?" + log.get("request_url_query"); - } - - // As a last sanity check to make sure we're not throwing away data as - // part of this migration, compare the original full URL string to the - // new URL composed of the various parts. - if (!value.getAsString().equals(reassmbledUrl)) { - // Ignore some of the default ports for comparison. - String compareUrl = value.getAsString().replaceFirst(":(80|443|50090)/", "/"); - - // Ignore url encoding of the hash fragment. - compareUrl = compareUrl.replaceFirst("#", "%23"); - - // Ignore case-sensitivity on the Host. 
- Pattern pattern = Pattern.compile("://(.+?)/"); - Matcher matcher = pattern.matcher(compareUrl); - StringBuffer buffer = new StringBuffer(); - while (matcher.find()) { - matcher.appendReplacement(buffer, - Matcher.quoteReplacement("://" + matcher.group(1).toLowerCase() + "/")); - } - matcher.appendTail(buffer); - compareUrl = buffer.toString(); - - if (!compareUrl.equals(reassmbledUrl)) { - logger.warn("request_url (" + value.getAsString() + " - " + compareUrl - + ") does not match reassembled URL (" + reassmbledUrl + ")"); - } - } - - value = null; - break; - - // The following are some renamed fields to better normalize the new - // storage schema. - case "response_time": - key = "timer_response"; - break; - case "backend_response_time": - key = "timer_backend_response"; - break; - case "internal_gatekeeper_time": - key = "timer_internal"; - break; - case "proxy_overhead": - key = "timer_proxy_overhead"; - break; - case "gatekeeper_denied_code": - key = "denied_reason"; - break; - case "imported": - key = "log_imported"; - break; - } - - // Handle empty strings values. - if (value != null && value.isJsonPrimitive() && value.getAsJsonPrimitive().isString() - && value.getAsJsonPrimitive().getAsString().equals("")) { - switch (key) { - // Replace empty string request_ips with 127.0.0.1. There's not a lot - // of these empty string values, but since this is considered a - // required field moving forward, let's make sure we have at least - // some real value in there. - case "request_ip": - logger.warn( - key + " contains empty string value: replacing with 127.0.0.1 since NULLs are not allowed on this field: " - + hit); - value = new JsonPrimitive("127.0.0.1"); - break; - - // Set empty string values to null. - // - // I think in all of our cases, nulls are more appropriate, we just - // have empty strings in some of our source data due to various - // migrations. Kylin (v1.2) also seems to treat empty strings as NULLs - // in the rollups, so for consistency sake, we'll avoid them. - default: - value = null; - break; - } - } - - if (value != null) { - // Set the value on the new record, performing type-casting as needed. - try { - Schema.Type type = schema.getFieldType(key); - if (type == Schema.Type.INT) { - log.put(key, value.getAsInt()); - } else if (type == Schema.Type.LONG) { - log.put(key, value.getAsLong()); - } else if (type == Schema.Type.DOUBLE) { - log.put(key, value.getAsDouble()); - } else if (type == Schema.Type.BOOLEAN) { - log.put(key, value.getAsBoolean()); - } else { - try { - log.put(key, value.getAsString()); - } catch (IllegalStateException e) { - // Handle some unexpected array types by comma-delimiting the - // values. 
- if (value.isJsonArray()) { - StringBuffer buffer = new StringBuffer(); - Iterator iter = value.getAsJsonArray().iterator(); - while (iter.hasNext()) { - buffer.append(iter.next().getAsString()); - if (iter.hasNext()) { - buffer.append(", "); - } - } - log.put(key, buffer.toString()); - } else { - throw (e); - } - } - } - } catch (Exception e) { - logger.error("Eror on field: " + key); - logger.error(e.getMessage()); - throw (e); - } - } - } - - OrcRow orcRecord = new OrcRow(schema.getNonPartitionFieldsList().size()); - for (int i = 0; i < schema.getNonPartitionFieldsList().size(); i++) { - String field = schema.getNonPartitionFieldsList().get(i); - Schema.Type type = schema.getFieldType(field); - Object rawValue = log.get(field); - Object value; - - if (rawValue != null) { - if (type == Schema.Type.INT) { - if (schema.isFieldTypeShort(field)) { - value = new ShortWritable(((Integer) rawValue).shortValue()); - } else { - value = new IntWritable((int) rawValue); - } - } else if (type == Schema.Type.LONG) { - value = new LongWritable((long) rawValue); - } else if (type == Schema.Type.DOUBLE) { - value = new DoubleWritable((double) rawValue); - } else if (type == Schema.Type.BOOLEAN) { - value = new BooleanWritable((boolean) rawValue); - } else if (type == Schema.Type.STRING) { - value = new Text((String) rawValue); - } else { - throw new IOException("Unknown type: " + type.toString()); - } - - orcRecord.setFieldValue(i, value); - } - } - - this.getOrcWriter().addRow(orcRecord); - } catch (Exception e) { - logger.error("Error on hit: " + hit); - logger.error(e.getMessage()); - throw (e); - } - } - - private DateTime parseTimestamp(JsonElement value) { - DateTime date; - if (value.getAsJsonPrimitive().isNumber()) { - date = new DateTime(value.getAsLong(), DateTimeZone.UTC); - } else { - date = this.dateTimeParser.parseDateTime(value.getAsString()); - } - return date.withZone(App.TIMEZONE); - } -} diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcField.java b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcField.java deleted file mode 100644 index 0f734484b..000000000 --- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcField.java +++ /dev/null @@ -1,37 +0,0 @@ -package apiumbrella.hadoop_analytics; - -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; - -// From https://gist.github.com/omalley/ccabae7cccac28f64812 -public class OrcField implements StructField { - private final String name; - private final ObjectInspector inspector; - final int offset; - - OrcField(String name, ObjectInspector inspector, int offset) { - this.name = name; - this.inspector = inspector; - this.offset = offset; - } - - @Override - public String getFieldName() { - return name; - } - - @Override - public ObjectInspector getFieldObjectInspector() { - return inspector; - } - - @Override - public int getFieldID() { - return offset; - } - - @Override - public String getFieldComment() { - return null; - } -} diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRow.java b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRow.java deleted file mode 100644 index 9b0cb126c..000000000 --- 
a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRow.java +++ /dev/null @@ -1,22 +0,0 @@ -package apiumbrella.hadoop_analytics; - -// From https://gist.github.com/omalley/ccabae7cccac28f64812 -public class OrcRow { - public Object[] columns; - - OrcRow(int colCount) { - columns = new Object[colCount]; - } - - void setFieldValue(int FieldIndex, Object value) { - columns[FieldIndex] = value; - } - - void setNumFields(int newSize) { - if (newSize != columns.length) { - Object[] oldColumns = columns; - columns = new Object[newSize]; - System.arraycopy(oldColumns, 0, columns, 0, oldColumns.length); - } - } -} diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRowInspector.java b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRowInspector.java deleted file mode 100644 index 829daa350..000000000 --- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRowInspector.java +++ /dev/null @@ -1,121 +0,0 @@ -package apiumbrella.hadoop_analytics; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; - -// From https://gist.github.com/omalley/ccabae7cccac28f64812 -public class OrcRowInspector extends SettableStructObjectInspector { - private List fields; - - public OrcRowInspector(List fields) { - super(); - this.fields = fields; - } - - @Override - public List getAllStructFieldRefs() { - return fields; - } - - @Override - public StructField getStructFieldRef(String s) { - for (StructField field : fields) { - if (field.getFieldName().equalsIgnoreCase(s)) { - return field; - } - } - return null; - } - - @Override - public Object getStructFieldData(Object object, StructField field) { - if (object == null) { - return null; - } - int offset = ((OrcField) field).offset; - OrcRow struct = (OrcRow) object; - if (offset >= struct.columns.length) { - return null; - } - - return struct.columns[offset]; - } - - @Override - public List getStructFieldsDataAsList(Object object) { - if (object == null) { - return null; - } - OrcRow struct = (OrcRow) object; - List result = new ArrayList(struct.columns.length); - for (Object child : struct.columns) { - result.add(child); - } - return result; - } - - @Override - public String getTypeName() { - StringBuilder buffer = new StringBuilder(); - buffer.append("struct<"); - for (int i = 0; i < fields.size(); ++i) { - StructField field = fields.get(i); - if (i != 0) { - buffer.append(","); - } - buffer.append(field.getFieldName()); - buffer.append(":"); - buffer.append(field.getFieldObjectInspector().getTypeName()); - } - buffer.append(">"); - return buffer.toString(); - } - - @Override - public Category getCategory() { - return Category.STRUCT; - } - - @Override - public Object create() { - return new OrcRow(0); - } - - @Override - public Object setStructFieldData(Object struct, StructField field, Object fieldValue) { - OrcRow orcStruct = (OrcRow) struct; - int offset = ((OrcField) field).offset; - // if the offset is bigger than our current number of fields, grow it - if (orcStruct.columns.length <= offset) { - orcStruct.setNumFields(offset + 1); - } - orcStruct.setFieldValue(offset, fieldValue); - return struct; - } - - @Override - public boolean equals(Object o) { - if (o == null || o.getClass() != 
getClass()) { - return false; - } else if (o == this) { - return true; - } else { - List other = ((OrcRowInspector) o).fields; - if (other.size() != fields.size()) { - return false; - } - for (int i = 0; i < fields.size(); ++i) { - StructField left = other.get(i); - StructField right = fields.get(i); - if (!(left.getFieldName().equalsIgnoreCase(right.getFieldName()) - && left.getFieldObjectInspector().equals(right.getFieldObjectInspector()))) { - return false; - } - } - return true; - } - } -} diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/resources/simplelogger.properties b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/resources/simplelogger.properties deleted file mode 100644 index 780ec143c..000000000 --- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/resources/simplelogger.properties +++ /dev/null @@ -1,6 +0,0 @@ -org.slf4j.simpleLogger.logFile=log/elasticsearch-import.log -org.slf4j.simpleLogger.defaultLogLevel=warn -org.slf4j.simpleLogger.log.apiumbrella=info -org.slf4j.simpleLogger.showDateTime=true -org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd'T'HH:mm:ss.SSSZ" -org.slf4j.simpleLogger.showLogName=false \ No newline at end of file diff --git a/src/api-umbrella/hadoop-analytics/pom.xml b/src/api-umbrella/hadoop-analytics/pom.xml deleted file mode 100644 index 85f4f5905..000000000 --- a/src/api-umbrella/hadoop-analytics/pom.xml +++ /dev/null @@ -1,30 +0,0 @@ - - 4.0.0 - - api-umbrella.hadoop-analytics - hadoop-analytics - 0.0.1-SNAPSHOT - pom - - hadoop-analytics - http://maven.apache.org - - - UTF-8 - target - - - - - - - ${buildDir} - - - - elasticsearch-import - processor - schema - - diff --git a/src/api-umbrella/hadoop-analytics/processor/.gitignore b/src/api-umbrella/hadoop-analytics/processor/.gitignore deleted file mode 100644 index 7e5babc1e..000000000 --- a/src/api-umbrella/hadoop-analytics/processor/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -/.classpath -/.project -/.settings -/dependency-reduced-pom.xml -/target/ diff --git a/src/api-umbrella/hadoop-analytics/processor/pom.xml b/src/api-umbrella/hadoop-analytics/processor/pom.xml deleted file mode 100644 index b4a144045..000000000 --- a/src/api-umbrella/hadoop-analytics/processor/pom.xml +++ /dev/null @@ -1,107 +0,0 @@ - - 4.0.0 - - - api-umbrella.hadoop-analytics - hadoop-analytics - 0.0.1-SNAPSHOT - - processor - jar - - processor - http://maven.apache.org - - - UTF-8 - - - - - - api-umbrella.hadoop-analytics - schema - 0.0.1-SNAPSHOT - - - - joda-time - joda-time - 2.9.2 - - - - org.slf4j - slf4j-log4j12 - 1.7.21 - - - - org.apache.hive - hive-jdbc - 0.14.0 - - - - org.apache.hadoop - hadoop-client - 2.6.0 - - - - org.quartz-scheduler - quartz - 2.2.3 - - - org.quartz-scheduler - quartz-jobs - 2.2.3 - - - - - ${buildDir}/processor - - - - org.apache.maven.plugins - maven-shade-plugin - 2.4.3 - - - package - - shade - - - - - - apiumbrella.hadoop_analytics.App - - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - - - - diff --git a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/App.java b/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/App.java deleted file mode 100644 index 26672a1cb..000000000 --- a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/App.java +++ /dev/null @@ -1,210 +0,0 @@ -package apiumbrella.hadoop_analytics; - -import java.io.IOException; -import java.net.URI; -import 
java.net.URISyntaxException; -import java.sql.Connection; -import java.sql.DriverManager; -import java.sql.PreparedStatement; -import java.sql.SQLException; -import java.util.HashMap; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; -import org.quartz.CronScheduleBuilder; -import org.quartz.JobBuilder; -import org.quartz.JobDetail; -import org.quartz.Scheduler; -import org.quartz.SchedulerException; -import org.quartz.SimpleScheduleBuilder; -import org.quartz.Trigger; -import org.quartz.TriggerBuilder; -import org.quartz.impl.StdSchedulerFactory; -import org.slf4j.Logger; - -public class App { - private static boolean DISABLE_LIVE_DATA_CONVERSION = - Boolean.getBoolean("apiumbrella.disable_live_data_conversion"); - private static boolean DISABLE_KYLIN_REFRESH = - Boolean.getBoolean("apiumbrella.disable_kylin_refresh"); - protected static DateTimeZone TIMEZONE = - DateTimeZone.forID(System.getProperty("apiumbrella.timezone", "UTC")); - protected static final String HDFS_URI = - System.getProperty("apiumbrella.hdfs_uri", "hdfs://127.0.0.1:8020"); - protected static final String HDFS_ROOT = "/apps/api-umbrella"; - protected static final String HDFS_LOGS_ROOT = HDFS_ROOT + "/logs"; - protected static final String HDFS_LOGS_LIVE_ROOT = App.HDFS_ROOT + "/logs-live"; - protected static final String LOGS_TABLE_NAME = "api_umbrella.logs"; - protected static final String LOGS_LIVE_TABLE_NAME = "api_umbrella.logs_live"; - - public void run() throws SchedulerException { - Scheduler scheduler = StdSchedulerFactory.getDefaultScheduler(); - scheduler.start(); - - if (!DISABLE_LIVE_DATA_CONVERSION) { - /* - * Job to convert the live log data (coming from Kafka & Flume) into the ORC files for - * querying and long-term storage. - */ - JobDetail convertJob = JobBuilder.newJob(ConvertLiveDataToOrc.class) - .withIdentity("convertLiveDataJob").usingJobData("lastMigratedPartitionTime", 0L) - .usingJobData("lastConcatenateTime", 0L).usingJobData("tablesCreated", false).build(); - /* Run every 30 seconds */ - Trigger convertTrigger = - TriggerBuilder.newTrigger().withIdentity("convertLiveDataTrigger").startNow() - .withSchedule( - SimpleScheduleBuilder.simpleSchedule().withIntervalInSeconds(30).repeatForever()) - .build(); - scheduler.scheduleJob(convertJob, convertTrigger); - } - - if (!DISABLE_KYLIN_REFRESH) { - /* Job to process the new log data into the pre-aggregated Kylin cubes. */ - JobDetail refreshJob = JobBuilder.newJob(RefreshKylin.class).withIdentity("refreshKylinJob") - .storeDurably().build(); - /* - * Run this job once immediately on boot to catch up with any new data that needs processing. - */ - Trigger refreshImmediateTrigger = TriggerBuilder.newTrigger() - .withIdentity("refreshKylinImmediateTrigger").forJob(refreshJob).startNow() - .withSchedule(SimpleScheduleBuilder.simpleSchedule()).build(); - /* - * After the initial run, run this every morning at 01:00 in the timezone used for data. 
This - * assumes that by 01:00 there will no longer be any data writing to the previous day's - * partition, so it's safe to fully process that day into Kylin. - */ - Trigger refreshDailyTrigger = TriggerBuilder.newTrigger() - .withIdentity("refreshKylinDailyTrigger").forJob(refreshJob).startNow() - .withSchedule( - CronScheduleBuilder.dailyAtHourAndMinute(1, 0).inTimeZone(TIMEZONE.toTimeZone())) - .build(); - scheduler.addJob(refreshJob, false); - scheduler.scheduleJob(refreshImmediateTrigger); - scheduler.scheduleJob(refreshDailyTrigger); - } - - scheduler.start(); - } - - public static void main(String[] args) throws SchedulerException { - if (System.getProperty("apiumbrella.root_log_level") == null) { - System.setProperty("apiumbrella.root_log_level", "WARN"); - } - - if (System.getProperty("apiumbrella.log_level") == null) { - System.setProperty("apiumbrella.log_level", "INFO"); - } - - App app = new App(); - app.run(); - } - - public static FileSystem getHadoopFileSystem() throws IOException, URISyntaxException { - Configuration conf = new Configuration(); - conf.setInt("ipc.client.connect.max.retries.on.timeouts", 3); - /* Fix for hadoop jar ordering: http://stackoverflow.com/a/21118824 */ - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - - return FileSystem.get(new URI(HDFS_URI), conf); - } - - public static Connection getHiveConnection() throws ClassNotFoundException, SQLException { - /* Load the Hive JDBC class. */ - Class.forName("org.apache.hive.jdbc.HiveDriver"); - - return DriverManager.getConnection("jdbc:hive2://localhost:10000/api_umbrella", "hive", ""); - } - - /** - * Find any partitions in the api_umbrella.logs table that are composed of more than 1 file and - * concatenate the partitions to merge them down to 1 file. This helps improve query performance - * on the historical data. - * - * @param logger - * @throws ClassNotFoundException - * @throws SQLException - * @throws IOException - * @throws IllegalArgumentException - * @throws URISyntaxException - */ - public static synchronized void concatenateTablePartitions(Logger logger) - throws ClassNotFoundException, SQLException, IOException, IllegalArgumentException, - URISyntaxException { - logger.debug("Begin concatenate table partitions"); - - /* Build the CONCATENATE SQL command. */ - LogSchema logSchema = new LogSchema(); - StringBuilder sql = new StringBuilder(); - sql.append("ALTER TABLE " + App.LOGS_TABLE_NAME + " PARTITION("); - for (int i = 0; i < logSchema.getPartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(","); - } - sql.append(logSchema.getPartitionFieldsList().get(i) + "=?"); - } - sql.append(") CONCATENATE"); - - /* - * Iterate over all the partition files to detect partitions that contain more than 1 file. 
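For reference, once the partition values are bound, the CONCATENATE statement assembled above takes roughly the following shape. This sketch inlines literal values only to show the rendered SQL; the removed code binds them through a PreparedStatement, and the concrete date is hypothetical:

```
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;

public class ConcatenateSqlExample {
  public static void main(String[] args) {
    DateTimeFormatter dateFormatter = ISODateTimeFormat.date();
    // Hypothetical day partition.
    DateTime partition = new DateTime(2016, 2, 17, 0, 0);

    // The four values mirror the truncations used elsewhere in this code:
    // first day of the year, month, and ISO week, plus the date itself.
    String sql = "ALTER TABLE api_umbrella.logs PARTITION("
        + "timestamp_tz_year='" + dateFormatter.print(partition.withDayOfYear(1)) + "',"
        + "timestamp_tz_month='" + dateFormatter.print(partition.withDayOfMonth(1)) + "',"
        + "timestamp_tz_week='" + dateFormatter.print(partition.withDayOfWeek(1)) + "',"
        + "timestamp_tz_date='" + dateFormatter.print(partition) + "') CONCATENATE";

    // Prints: ALTER TABLE api_umbrella.logs PARTITION(timestamp_tz_year='2016-01-01',
    //   timestamp_tz_month='2016-02-01',timestamp_tz_week='2016-02-15',
    //   timestamp_tz_date='2016-02-17') CONCATENATE
    System.out.println(sql);
  }
}
```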
- */ - RemoteIterator filesIter = - getHadoopFileSystem().listFiles(new Path(HDFS_URI + HDFS_LOGS_ROOT), true); - Pattern partitionDatePattern = Pattern.compile(".*timestamp_tz_date=(\\d{4}-\\d{2}-\\d{2})"); - HashMap partitionFileCounts = new HashMap(); - while (filesIter.hasNext()) { - FileStatus file = filesIter.next(); - Path partition = file.getPath().getParent(); - Matcher matcher = partitionDatePattern.matcher(partition.toString()); - DateTime partitionTime = null; - while (matcher.find()) { - partitionTime = new DateTime(matcher.group(1) + "T00:00:00"); - } - - Integer count = partitionFileCounts.get(partitionTime); - if (count == null) { - count = 0; - } - partitionFileCounts.put(partitionTime, count + 1); - } - - /* - * Iterate over all the partitions with more than 1 file and run the concatenate command against - * them. - */ - Connection connection = null; - DateTimeFormatter dateFormatter = ISODateTimeFormat.date(); - for (Map.Entry entry : partitionFileCounts.entrySet()) { - DateTime partition = entry.getKey(); - Integer count = entry.getValue(); - logger.debug("Partition " + partition + ": " + count + " files"); - if (count > 1) { - if (connection == null) { - connection = getHiveConnection(); - } - - logger.info("Concatenating table partition " + partition + " (" + count + " files)"); - PreparedStatement concatenate = connection.prepareStatement(sql.toString()); - concatenate.setString(1, dateFormatter.print(partition.withDayOfYear(1))); - concatenate.setString(2, dateFormatter.print(partition.withDayOfMonth(1))); - concatenate.setString(3, dateFormatter.print(partition.withDayOfWeek(1))); - concatenate.setString(4, dateFormatter.print(partition)); - concatenate.executeUpdate(); - concatenate.close(); - } - } - - logger.debug("Finish concatenate table partitions"); - } -} diff --git a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/ConvertLiveDataToOrc.java b/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/ConvertLiveDataToOrc.java deleted file mode 100644 index d702d1a4e..000000000 --- a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/ConvertLiveDataToOrc.java +++ /dev/null @@ -1,496 +0,0 @@ -package apiumbrella.hadoop_analytics; - -import java.io.BufferedWriter; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.net.URISyntaxException; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.SQLException; -import java.sql.Statement; -import java.util.TreeSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.joda.time.DateTime; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; -import org.quartz.DisallowConcurrentExecution; -import org.quartz.Job; -import org.quartz.JobDataMap; -import org.quartz.JobExecutionContext; -import org.quartz.JobExecutionException; -import org.quartz.PersistJobDataAfterExecution; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This class is responsible for migrating the live log data into permanent 
ORC storage (with likely - * a 2-3 minute delay). - * - * Live log data is sent by the application servers in the following fashion: - * - * [nginx] --json--> [rsyslog] --json--> [flume] --json--> [hdfs] - * - * Flume stores the JSON files in directories partitioned by minute. This background job then runs - * periodically to migrate this per-minute JSON data into the permanent ORC storage. The data is - * migrated by appending each minute's data to the ORC table, which is partitioned by day. With this - * approach, it may take around 2-3 minutes for the data to become populated in the ORC table (since - * we have to wait until we're sure the minute partition is no longer being written to). - * - * Note: A cleaner and more live solution would involve populating the ORC table directly using Hive - * transaction streaming: https://cwiki.apache.org/confluence/display/Hive/Streaming+Data+Ingest - * However, since we're on Hive 0.14 (for Kylin & HDP 2.2 compatibility), there are several bugs in - * the Hive streaming implementation that impact us - * (https://issues.apache.org/jira/browse/HIVE-8966, - * https://issues.apache.org/jira/browse/HIVE-11540, - * https://issues.apache.org/jira/browse/HIVE-5143), and Flume's Hive integration is also currently - * experimental (I've run into some issues and high memory use with it). So for now, we'll use this - * more manual approach that can work with the older version of Hive, but in the future this may be - * worth revisiting. - */ -@DisallowConcurrentExecution -@PersistJobDataAfterExecution -public class ConvertLiveDataToOrc implements Job { - private static final Path LAST_MIGRATED_MARKER = - new Path(App.HDFS_URI + App.HDFS_ROOT + "/.logs-live-last-migrated-partition-time"); - final Logger logger; - - Pattern partitionDatePattern = Pattern.compile( - ".*timestamp_tz_date=(\\d{4}-\\d{2}-\\d{2})/timestamp_tz_hour_minute=(\\d{2})-(\\d{2}).*"); - DateTimeFormatter dateFormatter = ISODateTimeFormat.date(); - DateTimeFormatter hourMinuteFormatter = DateTimeFormat.forPattern("HH-mm"); - private static LogSchema logSchema = new LogSchema(); - private static String createExternalTableSql; - private static String createLiveExternalTableSql; - private static String addPartitionSql; - private static String addLivePartitionSql; - private static String dropLivePartitionSql; - private static String insertAppendSql; - private static String insertOverwriteSql; - private FileSystem fileSystem; - private long lastMigratedPartitionTime = 0; - - public ConvertLiveDataToOrc() { - logger = LoggerFactory.getLogger(this.getClass()); - logger.debug("Initializing " + this.getClass()); - } - - public void execute(JobExecutionContext context) throws JobExecutionException { - try { - run(context); - } catch (Exception e) { - logger.error("Convert live data to ORC error", e); - JobExecutionException jobError = new JobExecutionException(e); - throw jobError; - } - } - - public void run(JobExecutionContext context) throws SQLException, IOException, - ClassNotFoundException, IllegalArgumentException, URISyntaxException { - logger.debug("Begin processing"); - - JobDataMap jobData = context.getJobDetail().getJobDataMap(); - lastMigratedPartitionTime = jobData.getLong("lastMigratedPartitionTime"); - - fileSystem = App.getHadoopFileSystem(); - if (lastMigratedPartitionTime == 0) { - if (fileSystem.exists(LAST_MIGRATED_MARKER)) { - FSDataInputStream markerInputStream = fileSystem.open(LAST_MIGRATED_MARKER); - lastMigratedPartitionTime = 
Long.parseLong(IOUtils.toString(markerInputStream), 10); - logger.debug("Read last migrated partition timestamp marker: " + lastMigratedPartitionTime); - } - } - - Connection connection = null; - try { - connection = App.getHiveConnection(); - createTables(jobData, connection); - - for (DateTime partition : getInactivePartitions()) { - if (partition.getMillis() > lastMigratedPartitionTime) { - logger.info("Migrating partition: " + partition); - - PreparedStatement addPartition = connection.prepareStatement(getAddPartitionSql()); - addPartition.setString(1, dateFormatter.print(partition.withDayOfYear(1))); - addPartition.setString(2, dateFormatter.print(partition.withDayOfMonth(1))); - addPartition.setString(3, dateFormatter.print(partition.withDayOfWeek(1))); - addPartition.setString(4, dateFormatter.print(partition)); - addPartition.executeUpdate(); - addPartition.close(); - - PreparedStatement addLivePartition = - connection.prepareStatement(getAddLivePartitionSql()); - addLivePartition.setString(1, dateFormatter.print(partition)); - addLivePartition.setString(2, hourMinuteFormatter.print(partition)); - addLivePartition.executeUpdate(); - addLivePartition.close(); - - PreparedStatement migrate = connection.prepareStatement(getInsertAppendSql()); - migrate.setString(1, dateFormatter.print(partition.withDayOfYear(1))); - migrate.setString(2, dateFormatter.print(partition.withDayOfMonth(1))); - migrate.setString(3, dateFormatter.print(partition.withDayOfWeek(1))); - migrate.setString(4, dateFormatter.print(partition)); - migrate.setString(5, dateFormatter.print(partition)); - migrate.setString(6, hourMinuteFormatter.print(partition)); - migrate.executeUpdate(); - migrate.close(); - - lastMigratedPartitionTime = partition.getMillis(); - jobData.put("lastMigratedPartitionTime", lastMigratedPartitionTime); - - FSDataOutputStream markerOutputStream = fileSystem.create(LAST_MIGRATED_MARKER, true); - BufferedWriter br = new BufferedWriter(new OutputStreamWriter(markerOutputStream)); - br.write(Long.toString(lastMigratedPartitionTime)); - br.close(); - } else { - logger.debug("Skipping already processed partition: " + partition); - } - } - - jobData.put("lastMigratedPartitionTime", lastMigratedPartitionTime); - - /* - * Since we're appending the live data to the ORC files every minute, we end up with many - * small ORC files in the api_umbrella.logs partitions. To help improve query performance, - * concatenate the table partitions ever 1 hour (which merges the existing files down to a - * single file). 
- */ - long now = DateTime.now().getMillis(); - long lastConcatenateTime = jobData.getLong("lastConcatenateTime"); - if (now - lastConcatenateTime > 1 * 60 * 60 * 1000) { - App.concatenateTablePartitions(logger); - jobData.put("lastConcatenateTime", now); - } - } finally { - if (connection != null) { - connection.close(); - } - } - - logger.debug("Finish processing"); - } - - /** - * Create both of the external tables (if they do not already exist) - * - * @param connection - * @throws SQLException - */ - private void createTables(JobDataMap jobData, Connection connection) throws SQLException { - boolean tablesCreated = jobData.getBoolean("tablesCreated"); - if (tablesCreated == false) { - Statement statement = connection.createStatement(); - - logger.info("Creating table (if not exists): " + App.LOGS_TABLE_NAME); - statement.executeUpdate(getCreateExternalTableSql()); - - logger.info("Creating table (if not exists): " + App.LOGS_LIVE_TABLE_NAME); - statement.executeUpdate(getCreateLiveExternalTableSql()); - - statement.close(); - jobData.put("tablesCreated", true); - } - } - - /** - * Generate the SQL string for creating the external "api_umbrella.logs" table in Hive. This is - * the table that stores all the data in the definitive format used for queries and processing. - * - * The columns are generated based on the Avro schema definition (src/main/resources/log.avsc) for - * our log data to ensure the schemas match across the different tables. - * - * @return - */ - private static String getCreateExternalTableSql() { - if (createExternalTableSql == null) { - StringBuilder sql = new StringBuilder(); - sql.append("CREATE EXTERNAL TABLE IF NOT EXISTS " + App.LOGS_TABLE_NAME + "("); - for (int i = 0; i < logSchema.getNonPartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(","); - } - String field = logSchema.getNonPartitionFieldsList().get(i); - sql.append(field + " " + logSchema.getFieldHiveType(field)); - } - sql.append(") "); - - sql.append("PARTITIONED BY("); - for (int i = 0; i < logSchema.getPartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(","); - } - String field = logSchema.getPartitionFieldsList().get(i); - sql.append(field + " " + logSchema.getFieldHiveType(field)); - } - sql.append(") "); - - sql.append("STORED AS ORC LOCATION '" + App.HDFS_LOGS_ROOT + "'"); - - createExternalTableSql = sql.toString(); - } - - return createExternalTableSql; - } - - /** - * Generate the SQL string for creating the external "api_umbrella.logs_live" table in Hive. This - * is the table that stores temporary data, partitioned by minute, which Flume writes to HDFS as - * JSON files. - * - * The columns are generated based on the Avro schema definition (src/main/resources/log.avsc) for - * our log data to ensure the schemas match across the different tables.
- * - * @return - */ - private static String getCreateLiveExternalTableSql() { - if (createLiveExternalTableSql == null) { - StringBuilder sql = new StringBuilder(); - sql.append("CREATE EXTERNAL TABLE IF NOT EXISTS " + App.LOGS_LIVE_TABLE_NAME + "("); - for (int i = 0; i < logSchema.getLiveNonPartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(","); - } - String field = logSchema.getLiveNonPartitionFieldsList().get(i); - sql.append(field + " " + logSchema.getFieldHiveType(field)); - } - sql.append(") "); - - sql.append("PARTITIONED BY("); - for (int i = 0; i < logSchema.getLivePartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(","); - } - String field = logSchema.getLivePartitionFieldsList().get(i); - sql.append(field + " " + logSchema.getFieldHiveType(field)); - } - sql.append(") "); - - sql.append("ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' "); - sql.append("STORED AS TEXTFILE LOCATION '" + App.HDFS_LOGS_LIVE_ROOT + "'"); - - createLiveExternalTableSql = sql.toString(); - } - - return createLiveExternalTableSql; - } - - /** - * Generate the SQL string for adding new date partitions to the "api_umbrella.logs" table. - * - * @return - */ - private static String getAddPartitionSql() { - if (addPartitionSql == null) { - StringBuilder sql = new StringBuilder(); - sql.append("ALTER TABLE " + App.LOGS_TABLE_NAME + " ADD IF NOT EXISTS "); - sql.append("PARTITION("); - for (int i = 0; i < logSchema.getPartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(","); - } - sql.append(logSchema.getPartitionFieldsList().get(i) + "=?"); - } - sql.append(")"); - - addPartitionSql = sql.toString(); - } - - return addPartitionSql; - } - - /** - * Generate the SQL string for adding new minute partitions to the "api_umbrella.logs_live" table. - * - * @return - */ - private static String getAddLivePartitionSql() { - if (addLivePartitionSql == null) { - StringBuilder sql = new StringBuilder(); - sql.append("ALTER TABLE " + App.LOGS_LIVE_TABLE_NAME + " ADD IF NOT EXISTS "); - sql.append("PARTITION("); - for (int i = 0; i < logSchema.getLivePartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(","); - } - sql.append(logSchema.getLivePartitionFieldsList().get(i) + "=?"); - } - sql.append(")"); - - addLivePartitionSql = sql.toString(); - } - - return addLivePartitionSql; - } - - protected static String getDropLivePartitionsDaySql() { - if (dropLivePartitionSql == null) { - StringBuilder sql = new StringBuilder(); - sql.append("ALTER TABLE " + App.LOGS_TABLE_NAME + " DROP IF EXISTS "); - sql.append("PARTITION(timestamp_tz_date=?)"); - - dropLivePartitionSql = sql.toString(); - } - - return dropLivePartitionSql; - } - - /** - * Generate the SQL string that copies a minute worth of data from the "api_umbrella.logs_live" - * table (JSON storage) to the real "api_umbrella.logs" table (ORC storage). - * - * Data is appended to the ORC table's full-day partition, so it's expected that this will run - * repeatedly for each new minute of data, but the same minute should not be repeated (or else the - * same data will be appended twice to the ORC table). 
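Since the INSERT/SELECT built for this append step is assembled piecemeal by the helpers that follow, it may help to see the approximate shape of the statement once the partition placeholders are bound. The column list is abbreviated and the date/minute values are hypothetical; the real builder enumerates every non-partition column from the Avro schema and uses "?" placeholders:

```
public class InsertAppendSqlExample {
  // Approximate rendered form of the minute-to-day append statement.
  static final String EXAMPLE_SQL =
      "INSERT INTO TABLE api_umbrella.logs "
          + "PARTITION(timestamp_tz_year='2016-01-01',timestamp_tz_month='2016-02-01',"
          + "timestamp_tz_week='2016-02-15',timestamp_tz_date='2016-02-17') "
          + "SELECT id, timestamp_utc /* ..., remaining non-partition columns */ "
          + "FROM api_umbrella.logs_live "
          + "WHERE timestamp_tz_date='2016-02-17' AND timestamp_tz_hour_minute='14-32' "
          + "ORDER BY timestamp_utc, id";

  public static void main(String[] args) {
    System.out.println(EXAMPLE_SQL);
  }
}
```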
- * - * @return - */ - private static String getInsertAppendSql() { - if (insertAppendSql == null) { - StringBuilder sql = new StringBuilder(); - sql.append("INSERT INTO TABLE " + App.LOGS_TABLE_NAME + " "); - - sql.append("PARTITION("); - for (int i = 0; i < logSchema.getPartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(","); - } - sql.append(logSchema.getPartitionFieldsList().get(i) + "=?"); - } - sql.append(") "); - - sql.append("SELECT "); - for (int i = 0; i < logSchema.getNonPartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(","); - } - sql.append(logSchema.getNonPartitionFieldsList().get(i)); - } - - sql.append(" FROM " + App.LOGS_LIVE_TABLE_NAME + " WHERE "); - for (int i = 0; i < logSchema.getLivePartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(" AND "); - } - sql.append(logSchema.getLivePartitionFieldsList().get(i) + "=?"); - } - sql.append(" ORDER BY timestamp_utc, id"); - - insertAppendSql = sql.toString(); - } - - return insertAppendSql; - } - - protected static String getInsertOverwriteDaySql() { - if (insertOverwriteSql == null) { - StringBuilder sql = new StringBuilder(); - sql.append("INSERT OVERWRITE TABLE " + App.LOGS_TABLE_NAME + " "); - - sql.append("PARTITION("); - for (int i = 0; i < logSchema.getPartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(","); - } - sql.append(logSchema.getPartitionFieldsList().get(i) + "=?"); - } - sql.append(") "); - - sql.append("SELECT DISTINCT "); - for (int i = 0; i < logSchema.getNonPartitionFieldsList().size(); i++) { - if (i > 0) { - sql.append(","); - } - sql.append(logSchema.getNonPartitionFieldsList().get(i)); - } - - sql.append(" FROM " + App.LOGS_LIVE_TABLE_NAME + " WHERE timestamp_tz_date=?"); - sql.append(" ORDER BY timestamp_utc, id"); - - insertOverwriteSql = sql.toString(); - } - - return insertOverwriteSql; - } - - /** - * Return all the minute partitions within the "api-umbrella.logs_live" table that are no longer - * being written to. - * - * This is determined by looking at all of the underlying HDFS files that Flume is writing to and - * seeing which partitions haven't been touched in over 1 minute. - * - * @return - * @throws IOException - */ - private TreeSet getInactivePartitions() throws IOException { - RemoteIterator filesIter = - fileSystem.listFiles(new Path(App.HDFS_URI + App.HDFS_LOGS_LIVE_ROOT), true); - - TreeSet inactivePartitions = new TreeSet(); - TreeSet activePartitions = new TreeSet(); - long inactiveTimeThreshold = new DateTime().minusMinutes(1).getMillis(); - while (filesIter.hasNext()) { - FileStatus file = filesIter.next(); - Path partition = file.getPath().getParent(); - Matcher matcher = partitionDatePattern.matcher(partition.toString()); - DateTime partitionTime = null; - while (matcher.find()) { - partitionTime = new DateTime( - matcher.group(1) + "T" + matcher.group(2) + ":" + matcher.group(3) + ":00"); - } - - /* - * If the file was recently written to, mark the partition as active (since we don't want to - * touch it until we're sure no more data will show up in it). - */ - if (file.getModificationTime() > inactiveTimeThreshold) { - activePartitions.add(partitionTime); - } else { - /* - * If the file is older, but currently being written to (denoted by Flume prefixing the file - * with an "_" so it's ignored from Hive), then we also want to ignore it. However, this - * case should be rare, since it would indicate Flume is writing to an older file than we - * expect. 
- */ - if (file.getPath().getName().startsWith("_")) { - activePartitions.add(partitionTime); - } else { - inactivePartitions.add(partitionTime); - } - } - } - - /* - * Loop through all the active partitions and ensure that none of these are also marked as - * inactive. This handles situations where the partition might have some older files, but other - * files still being written to. - */ - for (DateTime activePartition : activePartitions) { - logger.debug("Active partition being written to: " + activePartition); - inactivePartitions.remove(activePartition); - - /* - * If data is unexpectedly coming in out-of-order, log this as an error, since this must - * manually be dealt with. - */ - if (activePartition.getMillis() < lastMigratedPartitionTime) { - logger.warn("Partition with active files is unexpectedly older than the most recently " - + "processed data. This means older data is being populated in the live data files. " - + "This should be resolved during the nightly OVERWRITE refresh process. " - + "Partition: " + activePartition); - } - } - - return inactivePartitions; - } -} diff --git a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/RefreshKylin.java b/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/RefreshKylin.java deleted file mode 100644 index ae9f2195d..000000000 --- a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/RefreshKylin.java +++ /dev/null @@ -1,439 +0,0 @@ -package apiumbrella.hadoop_analytics; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.URISyntaxException; -import java.net.URL; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.TreeSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.HttpMethod; -import org.apache.commons.httpclient.UsernamePasswordCredentials; -import org.apache.commons.httpclient.auth.AuthScope; -import org.apache.commons.httpclient.methods.GetMethod; -import org.apache.commons.httpclient.methods.PutMethod; -import org.apache.commons.httpclient.methods.RequestEntity; -import org.apache.commons.httpclient.methods.StringRequestEntity; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.Days; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; -import org.quartz.DisallowConcurrentExecution; -import org.quartz.Job; -import org.quartz.JobExecutionContext; -import org.quartz.JobExecutionException; -import org.quartz.PersistJobDataAfterExecution; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; -import com.google.gson.JsonSyntaxException; - -@DisallowConcurrentExecution -@PersistJobDataAfterExecution -public class RefreshKylin implements Job { - final Logger logger; - private static final String KYLIN_URL = - 
System.getProperty("apiumbrella.kylin_url", "http://127.0.0.1:7070/kylin"); - private static final String KYLIN_USERNAME = - System.getProperty("apiumbrella.kylin_username", "ADMIN"); - private static final String KYLIN_PASSWORD = - System.getProperty("apiumbrella.kylin_password", "KYLIN"); - private static final String KYLIN_REFRESH_END_DATE = - System.getProperty("apiumbrella.kylin_refresh_end_date"); - private static final String CUBE_NAME = "logs_cube"; - DateTimeFormatter dateFormatter = ISODateTimeFormat.date(); - DateTimeFormatter hourMinuteFormatter = DateTimeFormat.forPattern("HH-mm"); - DateTimeFormatter dateTimeFormatter = - DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(App.TIMEZONE); - private HttpClient client; - private Connection connection; - - public RefreshKylin() { - logger = LoggerFactory.getLogger(this.getClass()); - logger.debug("Initializing " + this.getClass()); - } - - public void execute(JobExecutionContext context) throws JobExecutionException { - try { - connection = null; - try { - connection = App.getHiveConnection(); - run(); - } finally { - if (connection != null) { - connection.close(); - } - } - } catch (Exception e) { - logger.error("Refresh kylin error", e); - try { - Thread.sleep(30000); - } catch (InterruptedException e1) { - e1.printStackTrace(); - } - JobExecutionException jobError = new JobExecutionException(e); - jobError.refireImmediately(); - throw jobError; - } - } - - public void run() throws JsonSyntaxException, HttpException, IOException, SQLException, - ClassNotFoundException, IllegalArgumentException, URISyntaxException, InterruptedException { - logger.info("Begin kylin refresh"); - - URL url = new URL(KYLIN_URL); - client = new HttpClient(); - UsernamePasswordCredentials credentials = - new UsernamePasswordCredentials(KYLIN_USERNAME, KYLIN_PASSWORD); - AuthScope authScope = new AuthScope(url.getHost(), url.getPort(), AuthScope.ANY_REALM); - client.getState().setCredentials(authScope, credentials); - client.getParams().setAuthenticationPreemptive(true); - - /* Determine the last table partition with data in it. */ - DateTime lastPartitionDayStart = getPartitions(App.LOGS_TABLE_NAME).last(); - if (lastPartitionDayStart == null) { - logger.info("No data partitions exist, skipping kylin refresh"); - return; - } - - /* - * Determine how far Kylin should process data until. It should process until the most recent - * partition with data, but never process today's current data (we will defer processing each - * day until the day is over, so we don't have to worry about refreshing data). 
- */ - DateTime processUntilDayStart; - DateTime currentDayStart = - new DateTime(App.TIMEZONE).withTimeAtStartOfDay().withZoneRetainFields(DateTimeZone.UTC); - if (lastPartitionDayStart.isAfter(currentDayStart) - || lastPartitionDayStart.isEqual(currentDayStart)) { - processUntilDayStart = lastPartitionDayStart.minusDays(1); - } else { - processUntilDayStart = lastPartitionDayStart; - } - if (KYLIN_REFRESH_END_DATE != null) { - DateTimeFormatter dateParser = ISODateTimeFormat.dateParser().withZone(App.TIMEZONE); - processUntilDayStart = dateParser.parseDateTime(KYLIN_REFRESH_END_DATE); - } - logger.debug("Process until: " + processUntilDayStart + " (last partition: " - + lastPartitionDayStart + ")"); - DateTime processUntilDayEnd = processUntilDayStart.plusDays(1); - DateTime segmentStart = null; - DateTime segmentEnd = null; - - /* - * Before processing the last day of data, perform some additional data migrations to better - * optimize any of the daily partitions that were originally populated from the streaming live - * data. - */ - TreeSet livePartitions = getPartitions(App.LOGS_LIVE_TABLE_NAME); - for (DateTime livePartition : livePartitions) { - if (livePartition.isBefore(processUntilDayStart) || livePartition == processUntilDayStart) { - /* - * First, ensure the day's partition is done writing to (this should generally be the case, - * since this refresh kylin task is only called after midnight). - */ - waitForNoWriteActivityOnDayPartition(livePartition); - - /* - * Next, re-write the daily ORC partition with the live data. This better optimizes the data - * for long-term storage (less stripes than the data originally written from streaming - * appends). This also helps mitigate any potential edge-cases that may have led to the live - * data being missed during the live append process. - */ - overwriteDayPartitionFromLiveData(livePartition); - - /* - * Finally, remove the live data for this day, since the ORC data is now the definitive - * copy. - */ - removeLiveDataForDay(livePartition); - } - } - - /* Loop through the existing segments and see if any of them need to be refreshed. */ - JsonArray segments = getSegments(); - for (JsonElement segment : segments) { - JsonObject segmentObject = segment.getAsJsonObject(); - - /* Kill this run and wait for the next one if any segments are still processing. */ - String segmentName = segmentObject.get("name").getAsString(); - String segmentStatus = segmentObject.get("status").getAsString(); - if (!segmentStatus.equals("READY")) { - logger.info("Segment still processing, waiting for next refresh: " + segmentName + " - " - + segmentStatus); - return; - } - - segmentStart = - new DateTime(segmentObject.get("date_range_start").getAsLong(), DateTimeZone.UTC); - segmentEnd = new DateTime(segmentObject.get("date_range_end").getAsLong(), DateTimeZone.UTC); - DateTime segmentBuild = - new DateTime(segmentObject.get("last_build_time").getAsLong(), DateTimeZone.UTC); - - /* - * If the segment was built before the segment day is finished, then it needs to be refreshed. - * We'll also continue refreshing for up to 50 minutes after the day is finished to account - * for delayed data getting populated. - */ - if (segmentBuild.isBefore(segmentEnd.plusMinutes(50))) { - buildSegment("REFRESH", segmentStart, segmentEnd); - } - } - - /* - * Define the next segment to build: Either the day following the last existing segment, or the - * beginning of the cube if no existing segments exist. 
- */ - if (segmentStart == null) { - segmentStart = getCubeStart(); - } else { - segmentStart = new DateTime(segmentEnd); - } - - /* Loop over any segments that need building until the last partition day is hit. */ - while (segmentStart.isBefore(processUntilDayEnd)) { - /* - * Build a segment from the first day needed until the last partition day. But handle segments - * longer than 1 full day a bit differently. - */ - segmentEnd = new DateTime(processUntilDayEnd); - if (Days.daysBetween(segmentStart, segmentEnd).getDays() > 1) { - if (segmentStart.getYear() != segmentEnd.getYear()) { - /* - * If this segment would span separate years, then create separate segments for each - * calendar year. When first importing historical data, this helps the segment out the - * first big bulk processing. - */ - segmentEnd = segmentStart.withDayOfYear(1).plusYears(1); - } else { - /* - * Otherwise, if the segment would span multiple days, first create a segment until the - * second to last day. Then we'll create a separate segment for the final day. This helps - * ensure that the last day (which may contain live data), is put in its own segment, so - * it's quicker to re-process once the day is complete. - */ - segmentEnd = segmentEnd.minusDays(1); - } - } - - buildSegment("BUILD", segmentStart, segmentEnd); - segmentStart = segmentEnd; - } - - logger.info("Finish kylin refresh"); - } - - private JsonArray getSegments() throws HttpException, JsonSyntaxException, IOException { - GetMethod method = new GetMethod(KYLIN_URL + "/api/cubes/" + CUBE_NAME); - JsonObject result = makeRequest(method).getAsJsonObject(); - JsonArray segments = result.get("segments").getAsJsonArray(); - - return segments; - } - - private DateTime getCubeStart() throws HttpException, JsonSyntaxException, IOException { - GetMethod method = new GetMethod(KYLIN_URL + "/api/cube_desc/" + CUBE_NAME); - JsonObject result = makeRequest(method).getAsJsonArray().get(0).getAsJsonObject(); - DateTime start = new DateTime(result.get("partition_date_start").getAsLong(), DateTimeZone.UTC); - - return start; - } - - private TreeSet getPartitions(String tableName) - throws FileNotFoundException, IllegalArgumentException, IOException, URISyntaxException, - ClassNotFoundException, SQLException { - TreeSet partitions = new TreeSet(); - Pattern partitionDatePattern = Pattern.compile("timestamp_tz_date=(\\d{4}-\\d{2}-\\d{2})"); - PreparedStatement statement = connection.prepareStatement("SHOW PARTITIONS " + tableName); - ResultSet rs = statement.executeQuery(); - while (rs.next()) { - logger.debug(tableName + " partition: " + rs.getString("partition")); - Matcher matcher = partitionDatePattern.matcher(rs.getString("partition").toString()); - DateTime partitionTime = null; - while (matcher.find()) { - partitionTime = new DateTime(matcher.group(1) + "T00:00:00"); - } - - partitions.add(partitionTime); - } - - return partitions; - } - - private void waitForNoWriteActivityOnDayPartition(DateTime partition) - throws FileNotFoundException, IllegalArgumentException, IOException, URISyntaxException, - InterruptedException { - long inactiveTimeThreshold = new DateTime().minusMinutes(30).getMillis(); - long lastModifiedFile = 0; - do { - RemoteIterator filesIter = App.getHadoopFileSystem() - .listFiles(new Path(App.HDFS_URI + App.HDFS_LOGS_ROOT // - + "/timestamp_tz_year=" + dateFormatter.print(partition.withDayOfYear(1)) - + "/timestamp_tz_month=" + dateFormatter.print(partition.withDayOfMonth(1)) - + "/timestamp_tz_week=" + 
dateFormatter.print(partition.withDayOfWeek(1)) - + "/timestamp_tz_date=" + dateFormatter.print(partition)), true); - while (filesIter.hasNext()) { - FileStatus file = filesIter.next(); - logger.debug(file.getPath() + " file last modified: " + file.getModificationTime()); - if (file.getModificationTime() > lastModifiedFile) { - lastModifiedFile = file.getModificationTime(); - } - } - - if (lastModifiedFile > inactiveTimeThreshold) { - logger.info("Previous day partition has recently written data. " - + "Waiting until write activity hasn't occurred in 30 minutes. Last modification: " - + lastModifiedFile); - Thread.sleep(60000); - } - } while (lastModifiedFile == 0 || lastModifiedFile > inactiveTimeThreshold); - } - - private void overwriteDayPartitionFromLiveData(DateTime partition) - throws ClassNotFoundException, SQLException { - logger.info("Begin overwrite of partition " + partition + " from " + App.LOGS_LIVE_TABLE_NAME); - - PreparedStatement statement = connection.prepareStatement( - "SELECT COUNT(*) AS count FROM " + App.LOGS_TABLE_NAME + " WHERE timestamp_tz_date=?"); - statement.setString(1, dateFormatter.print(partition)); - ResultSet rs = statement.executeQuery(); - long beforePartitionCount = 0; - if (rs.next()) { - beforePartitionCount = rs.getLong("count"); - logger.debug("Partition (" + partition + ") count before overwrite: " + beforePartitionCount); - } - - PreparedStatement migrate = - connection.prepareStatement(ConvertLiveDataToOrc.getInsertOverwriteDaySql()); - migrate.setString(1, dateFormatter.print(partition.withDayOfYear(1))); - migrate.setString(2, dateFormatter.print(partition.withDayOfMonth(1))); - migrate.setString(3, dateFormatter.print(partition.withDayOfWeek(1))); - migrate.setString(4, dateFormatter.print(partition)); - migrate.setString(5, dateFormatter.print(partition)); - migrate.executeUpdate(); - migrate.close(); - - rs = statement.executeQuery(); - long afterPartitionCount = 0; - if (rs.next()) { - afterPartitionCount = rs.getLong("count"); - logger.debug("Partition (" + partition + ") count after overwrite: " + afterPartitionCount); - } - - if (beforePartitionCount != afterPartitionCount) { - logger.warn("Partition (" + partition + ") counts did not match"); - } - - logger.info("Finish overwrite of partition " + partition + " from " + App.LOGS_LIVE_TABLE_NAME); - } - - private void removeLiveDataForDay(DateTime partition) - throws SQLException, IllegalArgumentException, IOException, URISyntaxException { - logger.info("Removing live data for day: " + partition); - PreparedStatement addLivePartition = - connection.prepareStatement(ConvertLiveDataToOrc.getDropLivePartitionsDaySql()); - addLivePartition.setString(1, dateFormatter.print(partition)); - addLivePartition.setString(2, hourMinuteFormatter.print(partition)); - addLivePartition.executeUpdate(); - addLivePartition.close(); - - App.getHadoopFileSystem() - .delete(new Path(App.HDFS_URI + App.HDFS_LOGS_LIVE_ROOT // - + "/timestamp_tz_date=" + dateFormatter.print(partition) // - + "/timestamp_tz_hour_minute=" + hourMinuteFormatter.print(partition)), true); - } - - private void buildSegment(String buildType, DateTime start, DateTime end) - throws HttpException, JsonSyntaxException, IOException, ClassNotFoundException, - IllegalArgumentException, SQLException, URISyntaxException { - /* - * Before building any segments with Kylin, ensure that the table partitions are optimized by - * merging multiple files into 1.
- */ - App.concatenateTablePartitions(logger); - - logger.info("Begin building segment (" + buildType + "): " + start + " - " + end); - - PutMethod method = new PutMethod(KYLIN_URL + "/api/cubes/" + CUBE_NAME + "/rebuild"); - - JsonObject data = new JsonObject(); - data.addProperty("buildType", buildType); - data.addProperty("startTime", start.getMillis()); - data.addProperty("endTime", end.getMillis()); - - RequestEntity entity = new StringRequestEntity(data.toString(), "application/json", "UTF-8"); - method.setRequestEntity(entity); - - JsonObject result = makeRequest(method).getAsJsonObject(); - String jobUuid = result.get("uuid").getAsString(); - waitForJob(jobUuid); - - logger.info("Finish building segment (" + buildType + "): " + start + " - " + end); - } - - private String getJobStatus(String jobUuid) - throws HttpException, JsonSyntaxException, IOException { - GetMethod method = new GetMethod(KYLIN_URL + "/api/jobs/" + jobUuid); - JsonObject result = makeRequest(method).getAsJsonObject(); - String status = result.get("job_status").getAsString(); - - logger.debug("Job status: " + jobUuid + ": " + status); - - return status; - } - - private void waitForJob(String jobUuid) throws HttpException, JsonSyntaxException, IOException { - try { - while (!getJobStatus(jobUuid).equals("FINISHED")) { - Thread.sleep(30000); - } - } catch (InterruptedException ex) { - Thread.currentThread().interrupt(); - } - } - - private JsonElement makeRequest(HttpMethod method) - throws HttpException, JsonSyntaxException, IOException { - JsonElement result = null; - try { - client.executeMethod(method); - int responseStatus = method.getStatusLine().getStatusCode(); - InputStreamReader responseBody = new InputStreamReader(method.getResponseBodyAsStream()); - if (responseStatus != 200) { - logger.error("Failed to make request: " + method.getURI() + " - " + responseStatus + " - " - + IOUtils.toString(responseBody)); - throw new HttpException("Unsuccessful HTTP response"); - } - result = new JsonParser().parse(responseBody); - } catch (HttpException e) { - throw e; - } catch (JsonSyntaxException e) { - throw e; - } catch (IOException e) { - throw e; - } finally { - method.releaseConnection(); - } - - return result; - } -} diff --git a/src/api-umbrella/hadoop-analytics/processor/src/main/resources/log4j.properties b/src/api-umbrella/hadoop-analytics/processor/src/main/resources/log4j.properties deleted file mode 100644 index 6286e1e20..000000000 --- a/src/api-umbrella/hadoop-analytics/processor/src/main/resources/log4j.properties +++ /dev/null @@ -1,6 +0,0 @@ -log4j.rootLogger=${apiumbrella.root_log_level},stdout -log4j.logger.apiumbrella=${apiumbrella.log_level} -log4j.appender.stdout=org.apache.log4j.ConsoleAppender -log4j.appender.stdout.Target=System.out -log4j.appender.stdout.layout=org.apache.log4j.PatternLayout -log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} [%p] %c{1} - %m%n \ No newline at end of file diff --git a/src/api-umbrella/hadoop-analytics/schema/.gitignore b/src/api-umbrella/hadoop-analytics/schema/.gitignore deleted file mode 100644 index 7e5babc1e..000000000 --- a/src/api-umbrella/hadoop-analytics/schema/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -/.classpath -/.project -/.settings -/dependency-reduced-pom.xml -/target/ diff --git a/src/api-umbrella/hadoop-analytics/schema/pom.xml b/src/api-umbrella/hadoop-analytics/schema/pom.xml deleted file mode 100644 index 73fd89030..000000000 --- a/src/api-umbrella/hadoop-analytics/schema/pom.xml +++ /dev/null @@ -1,33 +0,0 @@ - - 4.0.0 - - - 
api-umbrella.hadoop-analytics - hadoop-analytics - 0.0.1-SNAPSHOT - - schema - jar - - schema - http://maven.apache.org - - - UTF-8 - - - - ${buildDir}/schema - - - - - - org.apache.avro - avro - 1.8.0 - - - diff --git a/src/api-umbrella/hadoop-analytics/schema/src/main/java/apiumbrella/hadoop_analytics/LogSchema.java b/src/api-umbrella/hadoop-analytics/schema/src/main/java/apiumbrella/hadoop_analytics/LogSchema.java deleted file mode 100644 index e0068b8e9..000000000 --- a/src/api-umbrella/hadoop-analytics/schema/src/main/java/apiumbrella/hadoop_analytics/LogSchema.java +++ /dev/null @@ -1,139 +0,0 @@ -package apiumbrella.hadoop_analytics; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; - -import org.apache.avro.Schema; -import org.apache.avro.Schema.Type; - -public class LogSchema { - private Schema schema; - private HashMap fieldTypes; - private ArrayList fieldNames; - private ArrayList partitionFields = new ArrayList(); - private ArrayList nonPartitionFields = new ArrayList(); - private ArrayList livePartitionFields = new ArrayList(); - private ArrayList liveNonPartitionFields = new ArrayList(); - private HashSet shortFields = new HashSet(); - private HashSet dateFields = new HashSet(); - - public LogSchema() { - fieldTypes = new HashMap(); - fieldNames = new ArrayList(); - for (Schema.Field field : getSchema().getFields()) { - Schema.Type type = field.schema().getType(); - if (type == Schema.Type.UNION) { - for (Schema unionSchema : field.schema().getTypes()) { - if (unionSchema.getType() != Schema.Type.NULL) { - type = unionSchema.getType(); - break; - } - } - } - - fieldTypes.put(field.name(), type); - fieldNames.add(field.name()); - } - - // Explicitly define which fields we'll be partitioning by, since these - // don't need to be sorted in the output file (since they're part of the - // file path, it's duplicative to store this data in the file). - partitionFields.add("timestamp_tz_year"); - partitionFields.add("timestamp_tz_month"); - partitionFields.add("timestamp_tz_week"); - partitionFields.add("timestamp_tz_date"); - - livePartitionFields.add("timestamp_tz_date"); - livePartitionFields.add("timestamp_tz_hour_minute"); - fieldTypes.put("timestamp_tz_hour_minute", Schema.Type.STRING); - - for (String name : fieldNames) { - if (!partitionFields.contains(name)) { - nonPartitionFields.add(name); - } - - if (!livePartitionFields.contains(name)) { - liveNonPartitionFields.add(name); - } - } - - // Define fields we want to store as short/smallints. Since Avro doesn't - // support these in its schema, but ORC does, we need to explicitly list - // these. 
- shortFields.add("response_status"); - - dateFields.add("timestamp_tz_year"); - dateFields.add("timestamp_tz_month"); - dateFields.add("timestamp_tz_week"); - dateFields.add("timestamp_tz_date"); - } - - protected Schema getSchema() { - if (schema == null) { - InputStream is = Schema.class.getClassLoader().getResourceAsStream("log.avsc"); - try { - schema = new Schema.Parser().parse(is); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } - } - return schema; - } - - protected Type getFieldType(String field) { - return fieldTypes.get(field); - } - - protected boolean isFieldTypeShort(String field) { - return shortFields.contains(field); - } - - protected String getFieldHiveType(String field) { - Type type = getFieldType(field); - if (type == Schema.Type.INT) { - if (isFieldTypeShort(field)) { - return "SMALLINT"; - } else { - return "INT"; - } - } else if (type == Schema.Type.LONG) { - return "BIGINT"; - } else if (type == Schema.Type.DOUBLE) { - return "DOUBLE"; - } else if (type == Schema.Type.BOOLEAN) { - return "BOOLEAN"; - } else if (type == Schema.Type.STRING) { - if (dateFields.contains(field)) { - return "DATE"; - } else { - return "STRING"; - } - } else { - return null; - } - } - - protected ArrayList getFieldNames() { - return fieldNames; - } - - protected ArrayList getPartitionFieldsList() { - return partitionFields; - } - - protected ArrayList getNonPartitionFieldsList() { - return nonPartitionFields; - } - - protected ArrayList getLivePartitionFieldsList() { - return livePartitionFields; - } - - protected ArrayList getLiveNonPartitionFieldsList() { - return liveNonPartitionFields; - } -} diff --git a/src/api-umbrella/hadoop-analytics/schema/src/main/resources/log.avsc b/src/api-umbrella/hadoop-analytics/schema/src/main/resources/log.avsc deleted file mode 100644 index 402b2bd18..000000000 --- a/src/api-umbrella/hadoop-analytics/schema/src/main/resources/log.avsc +++ /dev/null @@ -1,222 +0,0 @@ -{ - "type": "record", - "name": "Log", - "fields": [ - { - "name": "timestamp_utc", - "type": "long" - }, - { - "name": "id", - "type": "string" - }, - { - "name": "timestamp_tz_offset", - "type": "int" - }, - { - "name": "timestamp_tz_year", - "type": "string" - }, - { - "name": "timestamp_tz_month", - "type": "string" - }, - { - "name": "timestamp_tz_week", - "type": "string" - }, - { - "name": "timestamp_tz_date", - "type": "string" - }, - { - "name": "timestamp_tz_hour", - "type": "string" - }, - { - "name": "timestamp_tz_minute", - "type": "string" - }, - { - "name": "user_id", - "type": ["null", "string"] - }, - { - "name": "denied_reason", - "type": ["null", "string"] - }, - { - "name": "request_method", - "type": "string" - }, - { - "name": "request_url_scheme", - "type": "string" - }, - { - "name": "request_url_host", - "type": "string" - }, - { - "name": "request_url_port", - "type": "int" - }, - { - "name": "request_url_path", - "type": "string" - }, - { - "name": "request_url_path_level1", - "type": ["null", "string"] - }, - { - "name": "request_url_path_level2", - "type": ["null", "string"] - }, - { - "name": "request_url_path_level3", - "type": ["null", "string"] - }, - { - "name": "request_url_path_level4", - "type": ["null", "string"] - }, - { - "name": "request_url_path_level5", - "type": ["null", "string"] - }, - { - "name": "request_url_path_level6", - "type": ["null", "string"] - }, - { - "name": "request_url_query", - "type": ["null", "string"] - }, - { - "name": "request_ip", - "type": "string" - }, - { - "name": "request_ip_country", - 
"type": ["null", "string"] - }, - { - "name": "request_ip_region", - "type": ["null", "string"] - }, - { - "name": "request_ip_city", - "type": ["null", "string"] - }, - { - "name": "request_ip_lat", - "type": ["null", "double"] - }, - { - "name": "request_ip_lon", - "type": ["null", "double"] - }, - { - "name": "request_user_agent", - "type": ["null", "string"] - }, - { - "name": "request_user_agent_type", - "type": ["null", "string"] - }, - { - "name": "request_user_agent_family", - "type": ["null", "string"] - }, - { - "name": "request_size", - "type": ["null", "int"] - }, - { - "name": "request_accept", - "type": ["null", "string"] - }, - { - "name": "request_accept_encoding", - "type": ["null", "string"] - }, - { - "name": "request_content_type", - "type": ["null", "string"] - }, - { - "name": "request_connection", - "type": ["null", "string"] - }, - { - "name": "request_origin", - "type": ["null", "string"] - }, - { - "name": "request_referer", - "type": ["null", "string"] - }, - { - "name": "request_basic_auth_username", - "type": ["null", "string"] - }, - { - "name": "response_status", - "type": "int" - }, - { - "name": "response_content_type", - "type": ["null", "string"] - }, - { - "name": "response_content_length", - "type": ["null", "int"] - }, - { - "name": "response_content_encoding", - "type": ["null", "string"] - }, - { - "name": "response_transfer_encoding", - "type": ["null", "string"] - }, - { - "name": "response_server", - "type": ["null", "string"] - }, - { - "name": "response_cache", - "type": ["null", "string"] - }, - { - "name": "response_age", - "type": ["null", "int"] - }, - { - "name": "response_size", - "type": ["null", "int"] - }, - { - "name": "timer_response", - "type": ["null", "double"] - }, - { - "name": "timer_backend_response", - "type": ["null", "double"] - }, - { - "name": "timer_internal", - "type": ["null", "double"] - }, - { - "name": "timer_proxy_overhead", - "type": ["null", "double"] - }, - { - "name": "log_imported", - "type": ["null", "boolean"] - } - ] -} diff --git a/src/api-umbrella/web-app/Gemfile b/src/api-umbrella/web-app/Gemfile index 7d7cae2ba..6f7d8b713 100644 --- a/src/api-umbrella/web-app/Gemfile +++ b/src/api-umbrella/web-app/Gemfile @@ -18,9 +18,6 @@ gem "rack-proxy", "~> 0.6.4" gem "multi_json", "~> 1.13.1" gem "oj", "~> 3.6.0", :platforms => [:ruby] -# SQL escape libraries for Kylin analytics. 
-gem "sequel", "~> 5.8.0" - # MongoDB gem "mongoid", "~> 5.2.1" diff --git a/src/api-umbrella/web-app/Gemfile.lock b/src/api-umbrella/web-app/Gemfile.lock index 9e704a372..1231a4a58 100644 --- a/src/api-umbrella/web-app/Gemfile.lock +++ b/src/api-umbrella/web-app/Gemfile.lock @@ -282,7 +282,6 @@ GEM sprockets (>= 2.8, < 4.0) sprockets-rails (>= 2.0, < 4.0) tilt (>= 1.1, < 3) - sequel (5.8.0) simple_form (3.5.1) actionpack (> 4, < 5.2) activemodel (> 4, < 5.2) @@ -353,7 +352,6 @@ DEPENDENCIES rollbar (~> 2.15.5) safe_yaml (~> 1.0.4) sass-rails (~> 5.0) - sequel (~> 5.8.0) simple_form (~> 3.5.1) BUNDLED WITH diff --git a/src/api-umbrella/web-app/app/controllers/admin/sessions_controller.rb b/src/api-umbrella/web-app/app/controllers/admin/sessions_controller.rb index 4332f6ada..530e225cb 100644 --- a/src/api-umbrella/web-app/app/controllers/admin/sessions_controller.rb +++ b/src/api-umbrella/web-app/app/controllers/admin/sessions_controller.rb @@ -17,7 +17,6 @@ def auth if current_admin response.merge!({ "analytics_timezone" => ApiUmbrellaConfig[:analytics][:timezone], - "enable_beta_analytics" => (ApiUmbrellaConfig[:analytics][:adapter] == "kylin" || (ApiUmbrellaConfig[:analytics][:outputs] && ApiUmbrellaConfig[:analytics][:outputs].include?("kylin"))), "username_is_email" => ApiUmbrellaConfig[:web][:admin][:username_is_email], "local_auth_enabled" => ApiUmbrellaConfig[:web][:admin][:auth_strategies][:_enabled][:local], "password_length_min" => ApiUmbrellaConfig[:web][:admin][:password_length_min], diff --git a/src/api-umbrella/web-app/app/controllers/admin/stats_controller.rb b/src/api-umbrella/web-app/app/controllers/admin/stats_controller.rb index 62b4c8b0f..90baa8c77 100644 --- a/src/api-umbrella/web-app/app/controllers/admin/stats_controller.rb +++ b/src/api-umbrella/web-app/app/controllers/admin/stats_controller.rb @@ -37,16 +37,8 @@ def search end def logs - # TODO: For the SQL fetching, set start_time to end_time to limit to last - # 24 hours. If we do end up limiting it to the last 24 hours by default, - # figure out a better way to document this and still allow downloading - # the full data set. - start_time = params[:start_at] - if(@analytics_adapter == "kylin") - start_time = Time.zone.parse(params[:end_at]) - 1.day - end @search = LogSearch.factory(@analytics_adapter, { - :start_time => start_time, + :start_time => params[:start_at], :end_time => params[:end_at], :interval => params[:interval], }) diff --git a/src/api-umbrella/web-app/app/controllers/api/v0/analytics_controller.rb b/src/api-umbrella/web-app/app/controllers/api/v0/analytics_controller.rb index 7bc642e6c..25eab07cd 100644 --- a/src/api-umbrella/web-app/app/controllers/api/v0/analytics_controller.rb +++ b/src/api-umbrella/web-app/app/controllers/api/v0/analytics_controller.rb @@ -107,9 +107,9 @@ def generate_summary :end_time => Time.now.utc, :interval => "month", - # This query can take a long time to run against PrestoDB, so set a long - # timeout. But since we're only delivering cached results and refreshing - # periodically in the background, this long timeout should be okay. + # This query can take a long time to run, so set a long timeout. But + # since we're only delivering cached results and refreshing periodically + # in the background, this long timeout should be okay. 
:query_timeout => 20 * 60, # 20 minutes }) diff --git a/src/api-umbrella/web-app/app/controllers/application_controller.rb b/src/api-umbrella/web-app/app/controllers/application_controller.rb index 182f77980..86930ffad 100644 --- a/src/api-umbrella/web-app/app/controllers/application_controller.rb +++ b/src/api-umbrella/web-app/app/controllers/application_controller.rb @@ -91,11 +91,7 @@ def use_locale end def set_analytics_adapter - if(params[:beta_analytics] == "true") - @analytics_adapter = "kylin" - else - @analytics_adapter = ApiUmbrellaConfig[:analytics][:adapter] - end + @analytics_adapter = ApiUmbrellaConfig[:analytics][:adapter] end def set_time_zone diff --git a/src/api-umbrella/web-app/app/models/log_result.rb b/src/api-umbrella/web-app/app/models/log_result.rb index f5309e9ca..825f220e8 100644 --- a/src/api-umbrella/web-app/app/models/log_result.rb +++ b/src/api-umbrella/web-app/app/models/log_result.rb @@ -3,10 +3,6 @@ def self.factory(search, raw_result) case(search) when LogSearch::ElasticSearch LogResult::ElasticSearch.new(search, raw_result) - when LogSearch::Kylin - LogResult::Kylin.new(search, raw_result) - when LogSearch::Postgresql - LogResult::Postgresql.new(search, raw_result) end end end diff --git a/src/api-umbrella/web-app/app/models/log_result/kylin.rb b/src/api-umbrella/web-app/app/models/log_result/kylin.rb deleted file mode 100644 index 7e3ede752..000000000 --- a/src/api-umbrella/web-app/app/models/log_result/kylin.rb +++ /dev/null @@ -1,2 +0,0 @@ -class LogResult::Kylin < LogResult::Sql -end diff --git a/src/api-umbrella/web-app/app/models/log_result/postgresql.rb b/src/api-umbrella/web-app/app/models/log_result/postgresql.rb deleted file mode 100644 index a5a699ab5..000000000 --- a/src/api-umbrella/web-app/app/models/log_result/postgresql.rb +++ /dev/null @@ -1,2 +0,0 @@ -class LogResult::Postgresql < LogResult::Sql -end diff --git a/src/api-umbrella/web-app/app/models/log_result/sql.rb b/src/api-umbrella/web-app/app/models/log_result/sql.rb deleted file mode 100644 index 5b21e82b1..000000000 --- a/src/api-umbrella/web-app/app/models/log_result/sql.rb +++ /dev/null @@ -1,20 +0,0 @@ -class LogResult::Sql < LogResult::Base - def initialize(search, raw_result) - super - - if(search.result_processors.present?) 
- search.result_processors.each do |processor| - processor.call(self) - end - end - end - - def column_indexes(query_name) - column_indexes = {} - raw_result[query_name]["columnMetas"].each_with_index do |meta, index| - column_indexes[meta["label"].downcase] = index - end - - column_indexes - end -end diff --git a/src/api-umbrella/web-app/app/models/log_search.rb b/src/api-umbrella/web-app/app/models/log_search.rb index 7ffbd9b3b..f0a2288be 100644 --- a/src/api-umbrella/web-app/app/models/log_search.rb +++ b/src/api-umbrella/web-app/app/models/log_search.rb @@ -3,10 +3,6 @@ def self.factory(adapter, options = {}) case(adapter) when "elasticsearch" LogSearch::ElasticSearch.new(options) - when "kylin" - LogSearch::Kylin.new(options) - when "postgresql" - LogSearch::Postgresql.new(options) end end end diff --git a/src/api-umbrella/web-app/app/models/log_search/kylin.rb b/src/api-umbrella/web-app/app/models/log_search/kylin.rb deleted file mode 100644 index 4ae5c176c..000000000 --- a/src/api-umbrella/web-app/app/models/log_search/kylin.rb +++ /dev/null @@ -1,112 +0,0 @@ -class LogSearch::Kylin < LogSearch::Sql - def execute_kylin(sql) - @kylin_conn ||= Faraday.new(:url => "#{ApiUmbrellaConfig[:kylin][:protocol]}://#{ApiUmbrellaConfig[:kylin][:host]}:#{ApiUmbrellaConfig[:kylin][:port]}") do |faraday| - faraday.response :logger - faraday.adapter Faraday.default_adapter - faraday.basic_auth "ADMIN", "KYLIN" - end - if(ApiUmbrellaConfig[:kylin][:auth]) - @kylin_conn.basic_auth(ApiUmbrellaConfig[:kylin][:auth][:username], ApiUmbrellaConfig[:kylin][:auth][:password]) - end - - Rails.logger.info(sql) - response = @kylin_conn.post do |req| - req.url "/kylin/api/query" - req.headers["Content-Type"] = "application/json" - req.body = MultiJson.dump({ - :acceptPartial => false, - :project => "api_umbrella", - :sql => sql, - }) - end - - if(response.status != 200) - Rails.logger.error(response.body) - raise "Kylin Error" - end - - MultiJson.load(response.body) - end - - def execute_presto(sql) - @presto_conn ||= Faraday.new(:url => "#{ApiUmbrellaConfig[:presto][:protocol]}://#{ApiUmbrellaConfig[:presto][:host]}:#{ApiUmbrellaConfig[:presto][:port]}") do |faraday| - faraday.response :logger - faraday.adapter Faraday.default_adapter - end - if(ApiUmbrellaConfig[:presto][:auth]) - @presto_conn.basic_auth(ApiUmbrellaConfig[:presto][:auth][:username], ApiUmbrellaConfig[:presto][:auth][:password]) - end - - Rails.logger.info(sql) - response = @presto_conn.post do |req| - req.url "/v1/statement" - req.headers["Content-Type"] = "text/plain" - req.headers["X-Presto-User"] = "presto" - req.headers["X-Presto-Catalog"] = "hive" - req.headers["X-Presto-Schema"] = "default" - if(@options[:query_timeout]) - req.headers["X-Presto-Session"] = "query_max_run_time=#{@options[:query_timeout]}s" - end - req.body = sql - end - - results = { - "columnMetas" => [], - "results" => [], - } - while(response.status == 200) - query_result = MultiJson.load(response.body) - if(query_result["stats"] && query_result["stats"]["state"] == "FAILED") - Rails.logger.error(response.body) - raise "Presto Error" - end - - if(results["columnMetas"].empty? && query_result["columns"]) - results["columnMetas"] = query_result["columns"].map do |column| - { - "label" => column["name"], - } - end - end - - if(query_result["data"].present?) 
- results["results"] += query_result["data"] - end - - if(query_result["nextUri"]) - response = @presto_conn.get do |req| - req.url(query_result["nextUri"]) - end - else - break - end - end - - if(response.status != 200) - Rails.logger.error(response.body) - raise "Presto Error" - end - - results - end - - def execute_query(query_name, query = {}) - unless @query_results[query_name] - sql = build_query(query) - - if(@needs_presto) - results = execute_presto(sql) - else - begin - results = execute_kylin(sql) - rescue - results = execute_presto(sql) - end - end - - @query_results[query_name] = results - end - - @query_results[query_name] - end -end diff --git a/src/api-umbrella/web-app/app/models/log_search/postgresql.rb b/src/api-umbrella/web-app/app/models/log_search/postgresql.rb deleted file mode 100644 index 227a1bec1..000000000 --- a/src/api-umbrella/web-app/app/models/log_search/postgresql.rb +++ /dev/null @@ -1,2 +0,0 @@ -class LogSearch::Postgresql < LogSearch::Sql -end diff --git a/src/api-umbrella/web-app/app/models/log_search/sql.rb b/src/api-umbrella/web-app/app/models/log_search/sql.rb deleted file mode 100644 index cd44e846a..000000000 --- a/src/api-umbrella/web-app/app/models/log_search/sql.rb +++ /dev/null @@ -1,926 +0,0 @@ -class LogSearch::Sql < LogSearch::Base - attr_reader :result_processors - - NOT_NULL_FIELDS = [ - "request_ip", - ].freeze - - LEGACY_FIELDS = { - "backend_response_time" => "timer_backend_response", - "gatekeeper_denied_code" => "denied_reason", - "imported" => "log_imported", - "internal_gatekeeper_time" => "timer_internal", - "proxy_overhead" => "timer_proxy_overhead", - "request_at" => "timestamp_utc", - "request_host" => "request_url_host", - "request_path" => "request_url_path", - "request_scheme" => "request_url_scheme", - "response_time" => "timer_response", - }.freeze - - FIELD_TYPES = { - "response_status" => :int, - }.freeze - - def initialize(options = {}) - super - - @sequel = Sequel.connect("mock://postgresql") - - case(@interval) - when "minute" - @interval_field = "timestamp_tz_minute" - @interval_field_format = "%Y-%m-%d %H:%M:%S" - when "hour" - @interval_field = "timestamp_tz_hour" - @interval_field_format = "%Y-%m-%d %H:%M:%S" - when "day" - @interval_field = "timestamp_tz_date" - @interval_field_format = "%Y-%m-%d" - when "week" - @interval_field = "timestamp_tz_week" - @interval_field_format = "%Y-%m-%d" - when "month" - @interval_field = "timestamp_tz_month" - @interval_field_format = "%Y-%m-%d" - end - - @query = { - :select => [], - :where => [], - :group_by => [], - :order_by => [], - } - - @queries = {} - @query_results = {} - - @query_options = { - :size => 0, - :ignore_unavailable => "missing", - :allow_no_indices => true, - } - - @result_processors = [] - end - - def build_query(query) - select = @query[:select] + (query[:select] || []) - sql = "SELECT #{select.join(", ")} FROM api_umbrella.logs" - - where = @query[:where] + (query[:where] || []) - if(where.present?) - sql << " WHERE #{where.map { |clause| "(#{clause})" }.join(" AND ")}" - end - - group_by = @query[:group_by] + (query[:group_by] || []) - if(group_by.present?) - sql << " GROUP BY #{group_by.join(", ")}" - end - - order_by = @query[:order_by] + (query[:order_by] || []) - if(order_by.present?) - sql << " ORDER BY #{order_by.join(", ")}" - end - - limit = query[:limit] || @query[:limit] - if(limit.present?) 
- sql << " LIMIT #{limit}" - end - - sql - end - - def result - if @none - @result = LogResult.factory(self, {}) - return @result - end - - if(@query_results.empty? || @queries[:default]) - execute_query(:default, @queries[:default] || {}) - end - - @result = LogResult.factory(self, @query_results) - end - - def permission_scope!(scopes) - filter = [] - scopes["rules"].each do |rule| - filter << parse_query_builder(rule) - end - - @query[:where] << filter.join(" OR ") - end - - def search_type!(search_type) - if(search_type == "count") - @queries[:default] ||= {} - @queries[:default][:select] ||= [] - @queries[:default][:select] << "COUNT(*) AS total_count" - - @result_processors << proc do |result| - count = 0 - column_indexes = result.column_indexes(:default) - result.raw_result[:default]["results"].each do |row| - count = row[column_indexes["total_count"]].to_i - end - - result.raw_result["hits"] ||= {} - result.raw_result["hits"]["total"] = count - end - end - end - - def search!(query_string) - if(query_string.present?) - parser = LuceneQueryParser::Parser.new - - # LuceneQueryParser doesn't support the "!" alias for NOT. This is a - # quick hack, but we should address in the gem for better parsing - # accuracy (otherwise, this could replace ! inside strings). - query_string.gsub!("!", " NOT ") - - data = parser.parse(query_string) - - query = { - "condition" => "AND", - "rules" => [], - } - - data.each do |expression| - rule = { - "id" => expression[:field].to_s, - "field" => expression[:field].to_s, - "operator" => "equal", - "value" => expression[:term].to_s, - } - - if(expression[:term]) - rule["value"] = expression[:term].to_s - if(rule["value"].end_with?("*")) - rule["value"].gsub!(/\*$/, "") - rule["operator"] = "begins_with" - elsif(rule["value"].start_with?("*")) - rule["value"].gsub!(/^\*/, "") - rule["operator"] = "ends_with" - else - rule["operator"] = "equal" - end - - if(expression[:op].to_s == "NOT") - rule["operator"] = "not_#{rule["operator"]}" - end - - query["rules"] << rule - elsif(expression[:inclusive_range]) - query["rules"] << rule.merge({ - "operator" => "greater_or_equal", - "value" => expression[:inclusive_range][:from].to_s, - }) - - query["rules"] << rule.merge({ - "operator" => "less_or_equal", - "value" => expression[:inclusive_range][:to].to_s, - }) - elsif(expression[:exclusive_range]) - query["rules"] << rule.merge({ - "operator" => "greater", - "value" => expression[:exclusive_range][:from].to_s, - }) - - query["rules"] << rule.merge({ - "operator" => "less", - "value" => expression[:exclusive_range][:to].to_s, - }) - end - end - - filter = parse_query_builder(query) - if(filter.present?) - @query[:where] << filter - end - end - end - - def query!(query) - if(query.kind_of?(String) && query.present?) - query = MultiJson.load(query) - end - - filter = parse_query_builder(query) - if(filter.present?) - @query[:where] << filter - end - end - - def parse_query_builder(query) - query_filter = nil - - if(query.present?) - filters = [] - query["rules"].each do |rule| - field = rule["field"] - if(LEGACY_FIELDS[field]) - field = LEGACY_FIELDS[field] - end - - filter = nil - operator = nil - value = rule["value"] - - if(!CASE_SENSITIVE_FIELDS.include?(rule["field"]) && value.kind_of?(String)) - # FIXME: Is this needed now that everything is case-sensitive in SQL? - # value.downcase! - end - - if(value.present?) 
- case(FIELD_TYPES[field]) - when :int - value = Integer(value) - when :double - value = Float(value) - end - end - - case(field) - when "request_method" - value.upcase! - end - - case(rule["operator"]) - when "equal" - operator = "=" - when "not_equal" - # Use IS DISTINCT FROM instead of <> (aka !=), since we want to treat - # NULL values as not equal to the given value. - operator = "IS DISTINCT FROM" - when "begins_with" - operator = "LIKE" - value = "#{value}%" - when "not_begins_with" - operator = "NOT LIKE" - value = "#{value}%" - when "ends_with" - operator = "LIKE" - value = "%#{value}" - when "not_ends_with" - operator = "NOT LIKE" - value = "%#{value}" - when "contains" - operator = "LIKE" - value = "%#{value}%" - when "not_contains" - operator = "NOT LIKE" - value = "%#{value}%" - when "is_null" - operator = "IS NULL" - value = nil - when "is_not_null" - operator = "IS NOT NULL" - value = nil - when "less" - operator = "<" - when "less_or_equal" - operator = "<=" - when "greater" - operator = ">" - when "greater_or_equal" - operator = ">=" - when "between" - values = rule["value"].map { |v| v.to_f }.sort - filter = "#{@sequel.quote_identifier(field)} >= #{@sequel.literal(values[0])} AND #{@sequel.quote_identifier(field)} <= #{@sequel.literal(values[1])}" - else - raise "unknown filter operator: #{rule["operator"]} (rule: #{rule.inspect})" - end - - if(field == "request_url_path" && ["equal", "not_equal", "begins_with", "not_begins_with"].include?(rule["operator"])) - level_filters = [] - levels = value.split(%r{/+}, 6).reject { |l| l.blank? } - levels.each_with_index do |level, index| - level_field = "request_url_path_level#{index + 1}" - - level_value = level.dup - if(index == 0) - level_value = "/#{level_value}" - end - if(index < levels.length - 1) - level_value = "#{level_value}/" - end - - if(index < levels.length - 1) - if(["not_equal", "not_begins_with"].include?(rule["operator"])) - level_operator = "IS DISTINCT FROM" - else - level_operator = "=" - end - else - level_operator = operator - end - - level_filter = "#{@sequel.quote_identifier(level_field)} #{level_operator}" - unless(level_value.nil?) - level_filter << " #{@sequel.literal(level_value)}" - end - - level_filters << level_filter - end - - filter = level_filters.join(" AND ") - end - - unless(filter) - filter = "#{@sequel.quote_identifier(field)} #{operator}" - unless(value.nil?) - filter << " #{@sequel.literal(value)}" - end - end - - filters << filter - end - - if(filters.present?) - where = filters.map { |w| "(#{w})" } - if(query["condition"] == "OR") - query_filter = where.join(" OR ") - else - query_filter = where.join(" AND ") - end - end - end - - query_filter - end - - def offset!(from) - @query_options[:from] = from - end - - def limit!(size) - @query[:limit] = size - end - - def sort!(sort) - sort.each do |s| - field = s.keys.first - order = s.values.first - if(order == "desc" || order == "asc") - order = order.upcase - else - order = "ASC" - end - - @query[:order_by] << "#{@sequel.quote_identifier(field)} #{order}" - end - - @query[:sort] = sort - end - - def exclude_imported! - @query[:where] << "log_imported IS DISTINCT FROM true" - end - - def filter_by_date_range! 
- @query[:where] << @sequel.literal(Sequel.lit("timestamp_tz_date >= CAST(:start_time_date AS DATE) AND timestamp_tz_date <= CAST(:end_time_date AS DATE)", { - :start_time_date => @start_time.strftime("%Y-%m-%d"), - :end_time_date => @end_time.strftime("%Y-%m-%d"), - })) - end - - def filter_by_request_path!(request_path) - @query[:query][:filtered][:filter][:bool][:must] << { - :term => { - :request_path => request_path, - }, - } - end - - def filter_by_api_key!(api_key) - user = ApiUser.where(:api_key => api_key).first - @query[:where] << @sequel.literal(Sequel.lit("user_id = ?", user.id)) - end - - def filter_by_user!(user_email) - user = ApiUser.where(:email => user_email).first - @query[:where] << @sequel.literal(Sequel.lit("user_id = ?", user.id)) - end - - def filter_by_user_ids!(user_ids) - @query[:where] << @sequel.literal(Sequel.lit("user_id IN ?", user_ids)) - end - - def aggregate_by_drilldown!(prefix, size = 0) - @drilldown_prefix_segments = prefix.split("/") - @drilldown_depth = @drilldown_prefix_segments[0].to_i - - # Define the hierarchy of fields that will be involved in this query given - # the depth. - @drilldown_fields = ["request_url_host"] - (1..@drilldown_depth).each do |i| - @drilldown_fields << "request_url_path_level#{i}" - end - @drilldown_depth_field = @drilldown_fields.last - - # Define common parts of the query for all drilldown queries. - @drilldown_common_query = { - :select => ["COUNT(*) AS hits"], - :where => [], - :group_by => [], - } - @drilldown_fields.each_with_index do |field, index| - # If we're grouping by the top-level of the hierarchy (the hostname), we - # need to perform some custom logic to determine whether the hostname has - # any children data. - # - # For all the request_url_path_level* fields, we store the value with a - # trailing slash if there's further parts of the hierarchy. This gives us - # an easy way to determine whether the specific level is the terminating - # level of the hierarchy. In other words, this gives us a way to - # distinguish traffic ending in /foo versus /foo/bar (where /foo is a - # parent level). Since the host field doesn't follow this convention, we - # must emulate it here by looking at the level1 path to see if it's NULL - # or NOT NULL, and append the appropriate slash. - if(@drilldown_depth == 0) - @drilldown_common_query[:select] << "request_url_host || CASE WHEN request_url_path_level1 IS NULL THEN '' ELSE '/' END AS request_url_host" - @drilldown_common_query[:group_by] << "request_url_host, CASE WHEN request_url_path_level1 IS NULL THEN '' ELSE '/' END" - else - @drilldown_common_query[:select] << field - @drilldown_common_query[:group_by] << field - end - @drilldown_common_query[:where] << "#{field} IS NOT NULL" - - # Match all the parts of the host and path that are part of the prefix. - prefix_match_value = @drilldown_prefix_segments[index + 1] - if prefix_match_value - # Since we're only interested in matching the parent values, all of the - # request_url_path_level* fields should have trailing slashes (since - # that denotes that there's child data). request_url_path_level1 should - # also have a slash prefix (since it's the beginning of the path). 
- if(index == 1) - prefix_match_value = "/#{prefix_match_value}/" - elsif(index > 1) - prefix_match_value = "#{prefix_match_value}/" - end - @drilldown_common_query[:where] << @sequel.literal(Sequel.lit("#{field} = ?", prefix_match_value)) - end - end - - execute_query(:drilldown, { - :select => @drilldown_common_query[:select], - :where => @drilldown_common_query[:where], - :group_by => @drilldown_common_query[:group_by], - :order_by => ["hits DESC"], - }) - - # Massage the query into the aggregation format matching our old - # elasticsearch queries. - @result_processors << proc do |result| - buckets = [] - column_indexes = result.column_indexes(:drilldown) - result.raw_result[:drilldown]["results"].each do |row| - buckets << { - "key" => build_drilldown_prefix_from_result(row, column_indexes), - "doc_count" => row[column_indexes["hits"]].to_i, - } - end - - result.raw_result["aggregations"] ||= {} - result.raw_result["aggregations"]["drilldown"] ||= {} - result.raw_result["aggregations"]["drilldown"]["buckets"] = buckets - end - end - - def aggregate_by_drilldown_over_time!(prefix) - # Grab the top 10 paths out of the query previously done inside - # #aggregate_by_drilldown! - # - # A sub-query would probably be more efficient, but I'm still experiencing - # this error in Kylin 1.2: https://issues.apache.org/jira/browse/KYLIN-814 - top_paths = [] - top_path_indexes = {} - column_indexes = result.column_indexes(:drilldown) - @query_results[:drilldown]["results"][0, 10].each_with_index do |row, index| - value = row[column_indexes[@drilldown_depth_field]] - top_path_indexes[value] = index - if(@drilldown_depth == 0) - value = value.chomp("/") - end - top_paths << value - end - - # Get a date-based breakdown of the traffic to the top 10 paths. - top_path_where = [] - if(top_paths.any?) - top_path_where << @sequel.literal(Sequel.lit("#{@drilldown_depth_field} IN ?", top_paths)) - end - execute_query(:top_path_hits_over_time, { - :select => @drilldown_common_query[:select] + ["#{@interval_field} AS interval_field"], - :where => @drilldown_common_query[:where] + top_path_where, - :group_by => @drilldown_common_query[:group_by] + [@interval_field], - :order_by => ["interval_field"], - }) - - # Get a date-based breakdown of the traffic to all paths. This is used to - # come up with how much to allocate to the "Other" category (by subtracting - # away the top 10 traffic). This probably isn't the best way to go about - # this in SQL-land, but this is how we did things in ElasticSearch, so for - # now, keep with that same approach. - # - # Since we want a sum of all the traffic, we need to remove the last level - # of group by and selects (since that would give us per-path breakdowns, - # and we only want totals). - all_drilldown_select = @drilldown_common_query[:select][0..-2] - all_drilldown_group_by = @drilldown_common_query[:group_by][0..-2] - execute_query(:hits_over_time, { - :select => all_drilldown_select + ["#{@interval_field} AS interval_field"], - :where => @drilldown_common_query[:where], - :group_by => all_drilldown_group_by + [@interval_field], - :order_by => ["interval_field"], - }) - - # Massage the top_path_hits_over_time query into the aggregation format - # matching our old elasticsearch queries. 
- @result_processors << proc do |result| - buckets = [] - column_indexes = result.column_indexes(:top_path_hits_over_time) - result.raw_result[:top_path_hits_over_time]["results"].each do |row| - # Store the hierarchy breakdown in the order of overall traffic (so the - # path with the most traffic is always at the bottom of the graph). - path_index = top_path_indexes[row[column_indexes[@drilldown_depth_field]]] - - # If the path index isn't set, skip this result for top hit processing. - # This can happen when we're grouping by the top-level host, since our - # query to match results returns everything matching the top hostnames, - # however we may only be considering "example.com/" vs "example.com" as - # part of the top hits. - next unless(path_index) - - unless buckets[path_index] - buckets[path_index] = { - "key" => build_drilldown_prefix_from_result(row, column_indexes), - "doc_count" => 0, - "drilldown_over_time" => { - "time_buckets" => {}, - }, - } - end - - hits = row[column_indexes["hits"]].to_i - time = strptime_in_zone(row[column_indexes["interval_field"]], @interval_field_format) - - buckets[path_index]["doc_count"] += hits - buckets[path_index]["drilldown_over_time"]["time_buckets"][time.to_i] = { - "key" => time.to_i * 1000, - "key_as_string" => time.utc.iso8601, - "doc_count" => hits, - } - end - - buckets.each do |bucket| - bucket["drilldown_over_time"]["buckets"] = fill_in_time_buckets(bucket["drilldown_over_time"].delete("time_buckets")) - end - - result.raw_result["aggregations"] ||= {} - result.raw_result["aggregations"]["top_path_hits_over_time"] ||= {} - result.raw_result["aggregations"]["top_path_hits_over_time"]["buckets"] = buckets - end - - # Massage the hits_over_time query into the aggregation format matching our - # old elasticsearch queries. - @result_processors << proc do |result| - time_buckets = {} - column_indexes = result.column_indexes(:hits_over_time) - result.raw_result[:hits_over_time]["results"].each do |row| - time = strptime_in_zone(row[column_indexes["interval_field"]], @interval_field_format) - time_buckets[time.to_i] = { - "key" => time.to_i * 1000, - "key_as_string" => time.utc.iso8601, - "doc_count" => row[column_indexes["hits"]].to_i, - } - end - - result.raw_result["aggregations"] ||= {} - result.raw_result["aggregations"]["hits_over_time"] ||= {} - result.raw_result["aggregations"]["hits_over_time"]["buckets"] = fill_in_time_buckets(time_buckets) - end - end - - def aggregate_by_interval! - execute_query(:hits_over_time, { - :select => ["COUNT(*) AS hits", "#{@interval_field} AS interval_field"], - :group_by => [@interval_field], - :order_by => ["interval_field"], - }) - - # Massage the hits_over_time query into the aggregation format matching our - # old elasticsearch queries. 
- @result_processors << proc do |result| - time_buckets = {} - column_indexes = result.column_indexes(:hits_over_time) - result.raw_result[:hits_over_time]["results"].each do |row| - time = strptime_in_zone(row[column_indexes["interval_field"]], @interval_field_format) - time_buckets[time.to_i] = { - "key" => time.to_i * 1000, - "key_as_string" => time.utc.iso8601, - "doc_count" => row[column_indexes["hits"]].to_i, - } - end - - result.raw_result["aggregations"] ||= {} - result.raw_result["aggregations"]["hits_over_time"] ||= {} - result.raw_result["aggregations"]["hits_over_time"]["buckets"] = fill_in_time_buckets(time_buckets) - end - end - - def aggregate_by_region_field!(field) - @query[:aggregations] ||= {} - @query[:aggregations][:regions] = { - :terms => { - :field => field.to_s, - }, - } - - field = field.to_s - @query[:select] << "COUNT(*) AS hits" - @query[:select] << @sequel.quote_identifier(field) - @query[:group_by] << @sequel.quote_identifier(field) - - @result_processors << proc do |result| - buckets = [] - null_count = 0 - column_indexes = result.column_indexes(:default) - result.raw_result[:default]["results"].each do |row| - region = row[column_indexes[field]] - hits = row[column_indexes["hits"]].to_i - if(region.nil?) - null_count = hits - else - buckets << { - "key" => region, - "doc_count" => hits, - } - end - end - - result.raw_result["aggregations"] ||= {} - result.raw_result["aggregations"]["regions"] ||= {} - result.raw_result["aggregations"]["regions"]["buckets"] = buckets - - result.raw_result["aggregations"]["missing_regions"] ||= {} - result.raw_result["aggregations"]["missing_regions"]["doc_count"] = null_count - end - end - - def aggregate_by_country_regions!(country) - @query[:where] << @sequel.literal(Sequel.lit("request_ip_country = ?", country)) - aggregate_by_region_field!(:request_ip_region) - end - - def aggregate_by_us_state_cities!(country, state) - @query[:where] << @sequel.literal(Sequel.lit("request_ip_country = ?", country)) - @query[:where] << @sequel.literal(Sequel.lit("request_ip_region = ?", state)) - aggregate_by_region_field!(:request_ip_city) - end - - def aggregate_by_country_cities!(country) - @query[:where] << @sequel.literal(Sequel.lit("request_ip_country = ?", country)) - aggregate_by_region_field!(:request_ip_city) - end - - def aggregate_by_term!(field, size) - field = field.to_s - - query_name_top = :"aggregate_by_term_#{field}_top" - execute_query(query_name_top, { - :select => ["COUNT(*) AS hits", @sequel.quote_identifier(field)], - :where => ["#{@sequel.quote_identifier(field)} IS NOT NULL"], - :group_by => [@sequel.quote_identifier(field)], - :order_by => ["hits DESC"], - :limit => size, - }) - - query_name_count = :"aggregate_by_term_#{field}_count" - execute_query(query_name_count, { - :select => ["COUNT(*) AS hits"], - }) - - query_name_null_count = :"aggregate_by_term_#{field}_null_count" - # Optimization: Skip IS NULL query for any NOT NULL columns, since it - # will always be 0. 
- if(!NOT_NULL_FIELDS.include?(field)) - execute_query(query_name_null_count, { - :select => ["COUNT(*) AS hits"], - :where => ["#{@sequel.quote_identifier(field)} IS NULL"], - }) - end - - @result_processors << proc do |result| - buckets = [] - column_indexes = result.column_indexes(query_name_top) - result.raw_result[query_name_top]["results"].each do |row| - buckets << { - "key" => row[column_indexes[field]], - "doc_count" => row[column_indexes["hits"]].to_i, - } - end - - result.raw_result["aggregations"] ||= {} - result.raw_result["aggregations"]["top_#{field.pluralize}"] ||= {} - result.raw_result["aggregations"]["top_#{field.pluralize}"]["buckets"] = buckets - end - - @result_processors << proc do |result| - count = 0 - column_indexes = result.column_indexes(query_name_count) - result.raw_result[query_name_count]["results"].each do |row| - count = row[column_indexes["hits"]].to_i - end - - result.raw_result["aggregations"] ||= {} - result.raw_result["aggregations"]["value_count_#{field.pluralize}"] ||= {} - result.raw_result["aggregations"]["value_count_#{field.pluralize}"]["value"] = count - end - - @result_processors << proc do |result| - count = 0 - # Still populate the NOT NULL fields with a 0 value for compatibility. - if(!NOT_NULL_FIELDS.include?(field)) - column_indexes = result.column_indexes(query_name_null_count) - result.raw_result[query_name_null_count]["results"].each do |row| - count = row[column_indexes["hits"]].to_i - end - end - - result.raw_result["aggregations"] ||= {} - result.raw_result["aggregations"]["missing_#{field.pluralize}"] ||= {} - result.raw_result["aggregations"]["missing_#{field.pluralize}"]["doc_count"] = count - end - end - - def aggregate_by_cardinality!(field) - field = field.to_s - @queries[:default] ||= {} - @queries[:default][:select] ||= [] - @queries[:default][:select] << "COUNT(DISTINCT #{@sequel.quote_identifier(field)}) AS #{@sequel.quote_identifier("#{field}_distinct_count")}" - - @result_processors << proc do |result| - count = 0 - column_indexes = result.column_indexes(:default) - result.raw_result[:default]["results"].each do |row| - count = row[column_indexes["#{field}_distinct_count"]].to_i - end - - result.raw_result["aggregations"] ||= {} - result.raw_result["aggregations"]["unique_#{field.pluralize}"] ||= {} - result.raw_result["aggregations"]["unique_#{field.pluralize}"]["value"] = count - end - end - - def aggregate_by_users!(size) - aggregate_by_term!(:user_id, size) - aggregate_by_cardinality!(:user_id) - - @result_processors << proc do |result| - result.raw_result["aggregations"]["missing_user_emails"] = result.raw_result["aggregations"].delete("missing_user_ids") - result.raw_result["aggregations"]["top_user_emails"] = result.raw_result["aggregations"].delete("top_user_ids") - result.raw_result["aggregations"]["unique_user_emails"] = result.raw_result["aggregations"].delete("unique_user_ids") - result.raw_result["aggregations"]["value_count_user_emails"] = result.raw_result["aggregations"].delete("value_count_user_ids") - - user_ids = result.raw_result["aggregations"]["top_user_emails"]["buckets"].map { |bucket| bucket["key"] } - users_by_id = ApiUser.where(:id.in => user_ids).group_by { |u| u.id } - result.raw_result["aggregations"]["top_user_emails"]["buckets"].each do |bucket| - user = users_by_id[bucket["key"]] - if(user && user.first) - bucket["key"] = user.first.email - end - end - end - end - - def aggregate_by_request_ip!(size) - aggregate_by_term!(:request_ip, size) - aggregate_by_cardinality!(:request_ip) - end - 
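The removed SQL adapters above all rely on the same pattern: run a flat SQL query, then use `result_processors` to massage the column/row output back into the Elasticsearch-style aggregation hashes the existing admin UI consumes (compare `aggregate_by_term!` and `LogResult::Sql#column_indexes`). The standalone sketch below illustrates only that reshaping step, outside the class; the column labels, sample rows, and the `top_request_ips` key are illustrative values, not part of the removed code.

```
# Flat result in the shape the SQL adapters returned: column metadata plus rows.
raw_result = {
  "columnMetas" => [{ "label" => "HITS" }, { "label" => "REQUEST_IP" }],
  "results" => [["42", "10.0.0.1"], ["7", "10.0.0.2"]],
}

# Map downcased column labels to their positions, mirroring
# LogResult::Sql#column_indexes.
column_indexes = {}
raw_result["columnMetas"].each_with_index do |meta, index|
  column_indexes[meta["label"].downcase] = index
end

# Reshape the rows into Elasticsearch-style "buckets" so downstream code that
# expects the Elasticsearch response format can consume them unchanged.
buckets = raw_result["results"].map do |row|
  {
    "key" => row[column_indexes["request_ip"]],
    "doc_count" => row[column_indexes["hits"]].to_i,
  }
end

aggregations = { "top_request_ips" => { "buckets" => buckets } }
puts aggregations.inspect
# => {"top_request_ips"=>{"buckets"=>[{"key"=>"10.0.0.1", "doc_count"=>42}, {"key"=>"10.0.0.2", "doc_count"=>7}]}}
```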
- def aggregate_by_user_stats!(options = {}) - @query[:select] << "COUNT(*) AS hits" - @query[:select] << "MAX(timestamp_utc) AS last_request_at" - @query[:select] << "user_id" - @query[:group_by] << "user_id" - - if(options[:order]) - if(options[:order]["_count"] == "asc") - @query[:order_by] << "hits ASC" - elsif(options[:order]["_count"] == "desc") - @query[:order_by] << "hits DESC" - elsif(options[:order]["last_request_at"] == "asc") - @query[:order_by] << "last_request_at ASC" - elsif(options[:order]["last_request_at"] == "desc") - @query[:order_by] << "last_request_at DESC" - end - end - - @result_processors << proc do |result| - buckets = [] - column_indexes = result.column_indexes(:default) - result.raw_result[:default]["results"].each do |row| - buckets << { - "key" => row[column_indexes["user_id"]], - "doc_count" => row[column_indexes["hits"]].to_i, - "last_request_at" => { - "value" => row[column_indexes["last_request_at"]].to_f, - "value_as_string" => Time.at(row[column_indexes["last_request_at"]].to_i / 1000.0).iso8601, - }, - } - end - - result.raw_result["aggregations"] ||= {} - result.raw_result["aggregations"]["user_stats"] ||= {} - result.raw_result["aggregations"]["user_stats"]["buckets"] = buckets - end - end - - def aggregate_by_response_time_average! - @queries[:default] ||= {} - @queries[:default][:select] ||= [] - @queries[:default][:select] << "AVG(timer_response) AS average_timer_response" - - @result_processors << proc do |result| - average = 0 - column_indexes = result.column_indexes(:default) - result.raw_result[:default]["results"].each do |row| - average = row[column_indexes["average_timer_response"]].to_f - end - - result.raw_result["aggregations"] ||= {} - result.raw_result["aggregations"]["response_time_average"] ||= {} - result.raw_result["aggregations"]["response_time_average"]["value"] = average - end - end - - def select_records! - @needs_presto = true - @query[:select] << "*" - - @result_processors << proc do |result| - hits = [] - columns = result.raw_result[:default]["columnMetas"].map { |m| m["label"].downcase } - result.raw_result[:default]["results"].each do |row| - data = Hash[columns.zip(row)] - data["request_url"] = "#{data["request_url_scheme"]}://#{data["request_url_host"]}:#{data["request_url_port"]}#{data["request_url_path"]}?#{data["request_url_query"]}" - hits << { "_source" => data } - end - - result.raw_result["hits"] ||= {} - result.raw_result["hits"]["total"] = hits.length - result.raw_result["hits"]["hits"] = hits - end - end - - private - - def fill_in_time_buckets(time_buckets) - time = @start_time - case @interval - when "minute" - time = time.change(:sec => 0) - else - time = time.send(:"beginning_of_#{@interval}") - end - - buckets = [] - while(time <= @end_time) - time_bucket = time_buckets[time.to_i] - time_bucket ||= { - "key" => time.to_i * 1000, - "key_as_string" => time.utc.iso8601, - "doc_count" => 0, - } - - buckets << time_bucket - - time += 1.send(:"#{@interval}") - end - - buckets - end - - def build_drilldown_prefix_from_result(row, column_indexes) - key = [@drilldown_depth.to_s] - @drilldown_fields.each do |field| - key << row[column_indexes[field]] - end - File.join(key) - end - - # Perform strptime in the current timezone. 
Based on time-zone aware strptime
-  # that's present in Rails 5 (but not Rails 3):
-  # https://github.com/rails/rails/pull/19618
-  #
-  # If the current timezone is US Eastern, this allows for parsing something
-  # like "2016-03-21" into "2016-03-21 04:00:00 UTC"
-  def strptime_in_zone(string, format)
-    t = Time.strptime(string, format)
-    Time.zone.local(t.year, t.month, t.mday, t.hour, t.min, t.sec)
-  end
-end
diff --git a/templates/etc/flume.properties.mustache b/templates/etc/flume.properties.mustache
deleted file mode 100644
index 021f9c414..000000000
--- a/templates/etc/flume.properties.mustache
+++ /dev/null
@@ -1,42 +0,0 @@
-api-umbrella-agent.sources = kafka-source
-api-umbrella-agent.channels = file-channel
-api-umbrella-agent.sinks = hdfs-sink
-
-# Use file storage for buffering data.
-api-umbrella-agent.channels.file-channel.type = file
-api-umbrella-agent.channels.file-channel.checkpointDir = {{db_dir}}/flume/file-channel/checkpoint
-api-umbrella-agent.channels.file-channel.dataDirs = {{db_dir}}/flume/file-channel/data
-api-umbrella-agent.channels.file-channel.use-fast-replay = true
-
-# Pull data from Kafka (which rsyslog is pumping data into).
-api-umbrella-agent.sources.kafka-source.type = org.apache.flume.source.kafka.KafkaSource
-api-umbrella-agent.sources.kafka-source.channels = file-channel
-api-umbrella-agent.sources.kafka-source.zookeeperConnect = 127.0.0.1:2181
-api-umbrella-agent.sources.kafka-source.groupId = api-umbrella-flume
-api-umbrella-agent.sources.kafka-source.topic = api_umbrella_logs
-
-# Extract the timestamp_utc from the JSON data.
-api-umbrella-agent.sources.kafka-source.interceptors = timestamp-extractor
-api-umbrella-agent.sources.kafka-source.interceptors.timestamp-extractor.type = regex_extractor
-api-umbrella-agent.sources.kafka-source.interceptors.timestamp-extractor.regex = "timestamp_utc": ?(\\d+)
-api-umbrella-agent.sources.kafka-source.interceptors.timestamp-extractor.serializers = timestamp-serializer
-api-umbrella-agent.sources.kafka-source.interceptors.timestamp-extractor.serializers.timestamp-serializer.type = org.apache.flume.interceptor.RegexExtractorInterceptorPassThroughSerializer
-api-umbrella-agent.sources.kafka-source.interceptors.timestamp-extractor.serializers.timestamp-serializer.name = timestamp
-
-# Write data to HDFS.
-api-umbrella-agent.sinks.hdfs-sink.type = hdfs
-api-umbrella-agent.sinks.hdfs-sink.channel = file-channel
-api-umbrella-agent.sinks.hdfs-sink.hdfs.path = /apps/api-umbrella/logs-live/timestamp_tz_date=%Y-%m-%d/timestamp_tz_hour_minute=%H-%M
-# Use "_" prefix for files currently being written to. This prevents Hive from
-# picking these files up before they're fully written (otherwise, the partial
-# gzip state results in invalid data).
-api-umbrella-agent.sinks.hdfs-sink.hdfs.inUsePrefix = _
-# Write compressed JSON files.
-api-umbrella-agent.sinks.hdfs-sink.hdfs.fileType = CompressedStream
-api-umbrella-agent.sinks.hdfs-sink.hdfs.codeC = gzip
-api-umbrella-agent.sinks.hdfs-sink.hdfs.fileSuffix = .json.gz
-# Write out the file every 15 seconds.
-api-umbrella-agent.sinks.hdfs-sink.hdfs.rollInterval = 15 -api-umbrella-agent.sinks.hdfs-sink.hdfs.rollSize = 0 -api-umbrella-agent.sinks.hdfs-sink.hdfs.rollCount = 0 -api-umbrella-agent.sinks.hdfs-sink.hdfs.timeZone = {{analytics.timezone}} diff --git a/templates/etc/kylin/kylin.properties b/templates/etc/kylin/kylin.properties deleted file mode 100644 index 9a861dbfe..000000000 --- a/templates/etc/kylin/kylin.properties +++ /dev/null @@ -1,174 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# kylin server's mode -kylin.server.mode=all - -# optional information for the owner of kylin platform, it can be your team's email -# currently it will be attached to each kylin's htable attribute -kylin.owner=whoami@kylin.apache.org - -# List of web servers in use, this enables one web server instance to sync up with other servers. -kylin.rest.servers=localhost:7070 - -# The metadata store in hbase -kylin.metadata.url=kylin_metadata@hbase - -# The storage for final cube file in hbase -kylin.storage.url=hbase - -# Temp folder in hdfs, make sure user has the right access to the hdfs directory -kylin.hdfs.working.dir=/kylin - -# HBase Cluster FileSystem, which serving hbase, format as hdfs://hbase-cluster:8020 -# leave empty if hbase running on same cluster with hive and mapreduce -kylin.hbase.cluster.fs= - -kylin.job.mapreduce.default.reduce.input.mb=500 - -# max job retry on error, default 0: no retry -kylin.job.retry=0 - -# If true, job engine will not assume that hadoop CLI reside on the same server as it self -# you will have to specify kylin.job.remote.cli.hostname, kylin.job.remote.cli.username and kylin.job.remote.cli.password -# It should not be set to "true" unless you're NOT running Kylin.sh on a hadoop client machine -# (Thus kylin instance has to ssh to another real hadoop client machine to execute hbase,hive,hadoop commands) -kylin.job.run.as.remote.cmd=false - -# Only necessary when kylin.job.run.as.remote.cmd=true -kylin.job.remote.cli.hostname= - -# Only necessary when kylin.job.run.as.remote.cmd=true -kylin.job.remote.cli.username= - -# Only necessary when kylin.job.run.as.remote.cmd=true -kylin.job.remote.cli.password= - -# Used by test cases to prepare synthetic data for sample cube -kylin.job.remote.cli.working.dir=/tmp/kylin - -# Max count of concurrent jobs running -kylin.job.concurrent.max.limit=10 - -# Time interval to check hadoop job status -kylin.job.yarn.app.rest.check.interval.seconds=10 - -# Hive database name for putting the intermediate flat tables -kylin.job.hive.database.for.intermediatetable=default - -#default compression codec for htable,snappy,lzo,gzip,lz4 -kylin.hbase.default.compression.codec=snappy - -#the percentage of the sampling, default 100% -kylin.job.cubing.inmem.sampling.percent=100 - -# The cut size for 
hbase region, in GB. -kylin.hbase.region.cut=5 - -# The hfile size of GB, smaller hfile leading to the converting hfile MR has more reducers and be faster -# set 0 to disable this optimization -kylin.hbase.hfile.size.gb=2 - -# Enable/disable ACL check for cube query -kylin.query.security.enabled=true - -# whether get job status from resource manager with kerberos authentication -kylin.job.status.with.kerberos=false - -## kylin security configurations - -# spring security profile, options: testing, ldap, saml -# with "testing" profile, user can use pre-defined name/pwd like KYLIN/ADMIN to login -kylin.security.profile=testing - -# default roles and admin roles in LDAP, for ldap and saml -acl.defaultRole=ROLE_ANALYST,ROLE_MODELER -acl.adminRole=ROLE_ADMIN - -#LDAP authentication configuration -ldap.server=ldap://ldap_server:389 -ldap.username= -ldap.password= - -#LDAP user account directory; -ldap.user.searchBase= -ldap.user.searchPattern= -ldap.user.groupSearchBase= - -#LDAP service account directory -ldap.service.searchBase= -ldap.service.searchPattern= -ldap.service.groupSearchBase= - -#SAML configurations for SSO -# SAML IDP metadata file location -saml.metadata.file=classpath:sso_metadata.xml -saml.metadata.entityBaseURL=https://hostname/kylin -saml.context.scheme=https -saml.context.serverName=hostname -saml.context.serverPort=443 -saml.context.contextPath=/kylin - - -ganglia.group= -ganglia.port=8664 - -## Config for mail service - -# If true, will send email notification; -mail.enabled=false -mail.host= -mail.username= -mail.password= -mail.sender= - -###########################config info for web####################### - -#help info ,format{name|displayName|link} ,optional -kylin.web.help.length=4 -kylin.web.help.0=start|Getting Started| -kylin.web.help.1=odbc|ODBC Driver| -kylin.web.help.2=tableau|Tableau Guide| -kylin.web.help.3=onboard|Cube Design Tutorial| - -#guide user how to build streaming cube -kylin.web.streaming.guide=http://kylin.apache.org/ - -#hadoop url link ,optional -kylin.web.hadoop= -#job diagnostic url link ,optional -kylin.web.diagnostic= -#contact mail on web page ,optional -kylin.web.contact_mail= - -###########################config info for front####################### - -#env DEV|QA|PROD -deploy.env=QA - -###########################deprecated configs####################### -kylin.sandbox=true -kylin.web.hive.limit=20 -# The cut size for hbase region, -#in GB. 
-# E.g, for cube whose capacity be marked as "SMALL", split region per 5GB by default -kylin.hbase.region.cut.small=5 -kylin.hbase.region.cut.medium=10 -kylin.hbase.region.cut.large=50 - -# Custom Config -kylin.rest.timezone=GMT diff --git a/templates/etc/kylin/kylin_hive_conf.xml b/templates/etc/kylin/kylin_hive_conf.xml deleted file mode 100644 index 3682c4760..000000000 --- a/templates/etc/kylin/kylin_hive_conf.xml +++ /dev/null @@ -1,94 +0,0 @@ - - - - dfs.replication - 1 - Block replication - - - - dfs.block.size - 33554432 - 32MB, want more mappers for in-mem cubing, thus smaller the DFS block size - - - - hive.exec.compress.output - true - enable compress - - - - hive.auto.convert.join.noconditionaltask - true - enable map-side join - - - - hive.auto.convert.join.noconditionaltask.size - 300000000 - enable map-side join - - - - mapreduce.map.output.compress.codec - org.apache.hadoop.io.compress.SnappyCodec - - - - mapreduce.output.fileoutputformat.compress.codec - org.apache.hadoop.io.compress.SnappyCodec - - - - - hive.merge.mapfiles - true - Enable hive file merge on mapper only job - - - hive.merge.mapredfiles - true - Enable hive file merge on map-reduce job - - - mapred.output.compression.type - BLOCK - The compression type to use for job outputs - - - hive.merge.size.per.task - 256000000 - Size for the merged file - - - - hive.support.concurrency - false - Hive concurrency lock - - - - mapreduce.job.split.metainfo.maxsize - -1 - The maximum permissible size of the split metainfo file. - The JobTracker won't attempt to read split metainfo files bigger than - the configured value. No limits if set to -1. - - - diff --git a/templates/etc/kylin/kylin_job_conf.xml b/templates/etc/kylin/kylin_job_conf.xml deleted file mode 100644 index fd89584d5..000000000 --- a/templates/etc/kylin/kylin_job_conf.xml +++ /dev/null @@ -1,113 +0,0 @@ - - - - - mapreduce.job.split.metainfo.maxsize - -1 - The maximum permissible size of the split metainfo file. - The JobTracker won't attempt to read split metainfo files bigger than - the configured value. No limits if set to -1. - - - - - mapred.compress.map.output - true - Compress map outputs - - - - mapred.map.output.compression.codec - org.apache.hadoop.io.compress.SnappyCodec - The compression codec to use for map outputs - - - - - mapred.output.compress - true - Compress the output of a MapReduce job - - - - mapred.output.compression.codec - org.apache.hadoop.io.compress.SnappyCodec - The compression codec to use for job outputs - - - - - mapred.output.compression.type - BLOCK - The compression type to use for job outputs - - - - - mapreduce.job.max.split.locations - 2000 - No description - - - - dfs.replication - 1 - Block replication - - - - mapred.task.timeout - 3600000 - Set task timeout to 1 hour - - - - mapreduce.map.memory.mb - 1024 - - - - - mapreduce.map.java.opts - -Xmx750m - - - - - - mapreduce.map.memory.mb - 3072 - - - - - mapreduce.map.java.opts - -Xmx2700m - - - - - mapreduce.task.io.sort.mb - 200 - - - - diff --git a/templates/etc/kylin/kylin_job_conf_inmem.xml b/templates/etc/kylin/kylin_job_conf_inmem.xml deleted file mode 100644 index ec4a202c0..000000000 --- a/templates/etc/kylin/kylin_job_conf_inmem.xml +++ /dev/null @@ -1,98 +0,0 @@ - - - - - mapreduce.job.split.metainfo.maxsize - -1 - The maximum permissible size of the split metainfo file. - The JobTracker won't attempt to read split metainfo files bigger than - the configured value. No limits if set to -1. 
- - - - - mapred.compress.map.output - true - Compress map outputs - - - - mapred.map.output.compression.codec - org.apache.hadoop.io.compress.SnappyCodec - The compression codec to use for map outputs - - - - - mapred.output.compress - true - Compress the output of a MapReduce job - - - - mapred.output.compression.codec - org.apache.hadoop.io.compress.SnappyCodec - The compression codec to use for job outputs - - - - - mapred.output.compression.type - BLOCK - The compression type to use for job outputs - - - - - mapreduce.job.max.split.locations - 2000 - No description - - - - dfs.replication - 1 - Block replication - - - - mapred.task.timeout - 3600000 - Set task timeout to 1 hour - - - - - mapreduce.map.memory.mb - 3072 - - - - - mapreduce.map.java.opts - -Xmx2700m - - - - - mapreduce.task.io.sort.mb - 200 - - - - diff --git a/templates/etc/perp/flume/rc.env.mustache b/templates/etc/perp/flume/rc.env.mustache deleted file mode 100644 index 045b49ba8..000000000 --- a/templates/etc/perp/flume/rc.env.mustache +++ /dev/null @@ -1 +0,0 @@ -JAVA_OPTS={{flume.embedded_server_config.java_opts}} diff --git a/templates/etc/perp/flume/rc.log b/templates/etc/perp/flume/rc.log deleted file mode 100755 index 089baa65a..000000000 --- a/templates/etc/perp/flume/rc.log +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -exec ../rc.log "$@" diff --git a/templates/etc/perp/flume/rc.main.mustache b/templates/etc/perp/flume/rc.main.mustache deleted file mode 100755 index ae252bcfd..000000000 --- a/templates/etc/perp/flume/rc.main.mustache +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash - -# Redirect stderr to stdout -exec 2>&1 - -if [ "${1}" = "start" ]; then - echo "starting ${2}..." - api_umbrella_user="{{user}}" - - run_args=("-e" "rc.env") - if [ -n "$api_umbrella_user" ]; then - run_args+=("-u" "$api_umbrella_user") - fi - - exec runtool "${run_args[@]}" "{{_embedded_root_dir}}/flume/bin/flume-ng" \ - agent \ - --name api-umbrella-agent \ - --conf "{{_embedded_root_dir}}/flume/conf" \ - --conf-file "{{etc_dir}}/flume.properties" \ - -Dflume.root.logger=INFO,console -fi - -exit 0 diff --git a/templates/etc/perp/kylin/rc.env.mustache b/templates/etc/perp/kylin/rc.env.mustache deleted file mode 100644 index fa4994032..000000000 --- a/templates/etc/perp/kylin/rc.env.mustache +++ /dev/null @@ -1,4 +0,0 @@ -KYLIN_HOME={{_embedded_root_dir}}/kylin -# HBase maxsize must be set higher in Kylin 1.5+ to avoid HBase KeyValue size -# too large errors. -KYLIN_CUSTOM_START_OPTS=-Dkylin.hbase.client.keyvalue.maxsize=30485760 -DKYLIN_CONF={{etc_dir}}/kylin diff --git a/templates/etc/perp/kylin/rc.log b/templates/etc/perp/kylin/rc.log deleted file mode 100755 index 089baa65a..000000000 --- a/templates/etc/perp/kylin/rc.log +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -exec ../rc.log "$@" diff --git a/templates/etc/perp/kylin/rc.main.mustache b/templates/etc/perp/kylin/rc.main.mustache deleted file mode 100755 index 7e8852f01..000000000 --- a/templates/etc/perp/kylin/rc.main.mustache +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash - -# Redirect stderr to stdout -exec 2>&1 - -if [ "${1}" = "start" ]; then - echo "starting ${2}..." 
-  api_umbrella_user="{{user}}"
-
-  run_args=("-e" "rc.env")
-  if [ -n "$api_umbrella_user" ]; then
-    run_args+=("-u" "$api_umbrella_user")
-  fi
-
-  exec runtool "${run_args[@]}" "{{etc_dir}}/perp/kylin/rc.run"
-fi
-
-exit 0
diff --git a/templates/etc/perp/kylin/rc.run.mustache b/templates/etc/perp/kylin/rc.run.mustache
deleted file mode 100755
index 3979bbeeb..000000000
--- a/templates/etc/perp/kylin/rc.run.mustache
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env bash
-
-# A script to start Kylin, but in the foreground. This is basically a copy of
-# the normal Kylin start script
-# (version 1.2: https://github.com/apache/kylin/blob/kylin-1.2/bin/kylin.sh),
-# but modified so that it doesn't background the kylin process. This provides a
-# perp-compatible way to start Kylin (since it requires foreground processes).
-#
-# We should see about submitting a "run" pull request to Kylin to provide
-# foreground based starting directly in Kylin so we can get rid of this script.
-
-dir="{{_embedded_root_dir}}/kylin/bin"
-source ${dir}/check-env.sh
-mkdir -p ${KYLIN_HOME}/logs
-
-tomcat_root=${dir}/../tomcat
-export tomcat_root
-
-rm -rf ${tomcat_root}/webapps/kylin
-
-spring_profile=`sh ${dir}/get-properties.sh kylin.security.profile`
-if [ -z "$spring_profile" ]
-then
-    echo 'please set kylin.security.profile in kylin.properties, options are: testing, ldap, saml.'
-    exit 1
-else
-    echo "kylin.security.profile is set to $spring_profile"
-fi
-
-source ${dir}/find-hive-dependency.sh
-source ${dir}/find-hbase-dependency.sh
-if [ -f "${dir}/setenv.sh" ]
-    then source ${dir}/setenv.sh
-fi
-
-export HBASE_CLASSPATH_PREFIX=${tomcat_root}/bin/bootstrap.jar:${tomcat_root}/bin/tomcat-juli.jar:${tomcat_root}/lib/*:$HBASE_CLASSPATH_PREFIX
-mkdir -p ${KYLIN_HOME}/ext
-export HBASE_CLASSPATH=$hive_dependency:${KYLIN_HOME}/lib/*:${KYLIN_HOME}/ext/*:${HBASE_CLASSPATH}
-
-exec \
-  hbase \
-  ${KYLIN_EXTRA_START_OPTS} \
-  ${KYLIN_CUSTOM_START_OPTS} \
-  -Djava.util.logging.config.file=${tomcat_root}/conf/logging.properties \
-  -Djava.util.logging.manager=org.apache.juli.ClassLoaderLogManager \
-  -Dorg.apache.tomcat.util.buf.UDecoder.ALLOW_ENCODED_SLASH=true \
-  -Dorg.apache.catalina.connector.CoyoteAdapter.ALLOW_BACKSLASH=true \
-  -Djava.endorsed.dirs=${tomcat_root}/endorsed \
-  -Dcatalina.base=${tomcat_root} \
-  -Dcatalina.home=${tomcat_root} \
-  -Djava.io.tmpdir=${tomcat_root}/temp \
-  -Dkylin.hive.dependency=${hive_dependency} \
-  -Dkylin.hbase.dependency=${hbase_dependency} \
-  -Dspring.profiles.active=${spring_profile} \
-  org.apache.hadoop.util.RunJar ${tomcat_root}/bin/bootstrap.jar org.apache.catalina.startup.Bootstrap start
diff --git a/templates/etc/perp/presto/rc.log b/templates/etc/perp/presto/rc.log
deleted file mode 100755
index 089baa65a..000000000
--- a/templates/etc/perp/presto/rc.log
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/usr/bin/env bash
-exec ../rc.log "$@"
diff --git a/templates/etc/perp/presto/rc.main.mustache b/templates/etc/perp/presto/rc.main.mustache
deleted file mode 100755
index 7f8bf8512..000000000
--- a/templates/etc/perp/presto/rc.main.mustache
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env bash
-
-# Redirect stderr to stdout
-exec 2>&1
-
-if [ "${1}" = "start" ]; then
-  echo "starting ${2}..."
- api_umbrella_user="{{user}}" - - run_args=() - if [ -n "$api_umbrella_user" ]; then - run_args+=("-u" "$api_umbrella_user") - fi - - exec runtool "${run_args[@]}" "{{_embedded_root_dir}}/presto/bin/launcher" run \ - --node-config="{{etc_dir}}/presto/node.properties" \ - --jvm-config="{{etc_dir}}/presto/jvm.config" \ - --config="{{etc_dir}}/presto/config.properties" \ - --log-levels-file="{{etc_dir}}/presto/log.properties" -fi - -exit 0 diff --git a/templates/etc/presto/catalog/hive.properties.mustache b/templates/etc/presto/catalog/hive.properties.mustache deleted file mode 100644 index 25a3291d7..000000000 --- a/templates/etc/presto/catalog/hive.properties.mustache +++ /dev/null @@ -1,2 +0,0 @@ -connector.name=hive-hadoop2 -hive.metastore.uri={{presto.embedded_server_config.hive.metastore_uri}} diff --git a/templates/etc/presto/config.properties.mustache b/templates/etc/presto/config.properties.mustache deleted file mode 100644 index c08431cf4..000000000 --- a/templates/etc/presto/config.properties.mustache +++ /dev/null @@ -1,12 +0,0 @@ -coordinator=true -node-scheduler.include-coordinator=true -http-server.http.port={{presto.embedded_server_config.http_port}} -http-server.log.path={{log_dir}}/presto/http-request.log -query.max-memory={{presto.embedded_server_config.max_memory}} -query.max-memory-per-node={{presto.embedded_server_config.max_memory_per_node}} -discovery-server.enabled=true -discovery.uri={{presto.embedded_server_config.discovery_uri}} -catalog.config-dir={{etc_dir}}/presto/catalog - -# Limit how long individual queries can run. -query.max-run-time={{presto.embedded_server_config.query_max_run_time}} diff --git a/templates/etc/presto/jvm.config.mustache b/templates/etc/presto/jvm.config.mustache deleted file mode 100644 index ac29e837b..000000000 --- a/templates/etc/presto/jvm.config.mustache +++ /dev/null @@ -1,8 +0,0 @@ --server --Xmx2G --XX:+UseG1GC --XX:G1HeapRegionSize=32M --XX:+UseGCOverheadLimit --XX:+ExplicitGCInvokesConcurrent --XX:+HeapDumpOnOutOfMemoryError --XX:OnOutOfMemoryError=kill -9 %p diff --git a/templates/etc/presto/log.properties.mustache b/templates/etc/presto/log.properties.mustache deleted file mode 100644 index 1c52627f6..000000000 --- a/templates/etc/presto/log.properties.mustache +++ /dev/null @@ -1 +0,0 @@ -com.facebook.presto=INFO diff --git a/templates/etc/presto/node.properties.mustache b/templates/etc/presto/node.properties.mustache deleted file mode 100644 index f82801ed1..000000000 --- a/templates/etc/presto/node.properties.mustache +++ /dev/null @@ -1,3 +0,0 @@ -node.environment=production -node.id=ffffffff-ffff-ffff-ffff-ffffffffffff -node.data-dir={{db_dir}}/presto diff --git a/templates/etc/rsyslog.conf.mustache b/templates/etc/rsyslog.conf.mustache index 03baa2faf..62c267e5d 100644 --- a/templates/etc/rsyslog.conf.mustache +++ b/templates/etc/rsyslog.conf.mustache @@ -226,181 +226,6 @@ local0.info action( ) {{/analytics._output_elasticsearch?}} -{{#analytics._output_kylin?}} -module(load="omkafka") - -# Define the SQL-based output. 
-if($!raw!denied_reason != "") then { - set $!usr!sql!denied_reason = $!raw!denied_reason; -} -if($!raw!id != "") then { - set $!usr!sql!id = $!raw!id; -} -if($!raw!request_accept != "") then { - set $!usr!sql!request_accept = $!raw!request_accept; -} -if($!raw!request_accept_encoding != "") then { - set $!usr!sql!request_accept_encoding = $!raw!request_accept_encoding; -} -if($!raw!request_basic_auth_username != "") then { - set $!usr!sql!request_basic_auth_username = $!raw!request_basic_auth_username; -} -if($!raw!request_connection != "") then { - set $!usr!sql!request_connection = $!raw!request_connection; -} -if($!raw!request_content_type != "") then { - set $!usr!sql!request_content_type = $!raw!request_content_type; -} -if($!raw!request_ip != "") then { - set $!usr!sql!request_ip = $!raw!request_ip; -} -if($!raw!request_ip_city != "") then { - set $!usr!sql!request_ip_city = $!raw!request_ip_city; -} -if($!raw!request_ip_country != "") then { - set $!usr!sql!request_ip_country = $!raw!request_ip_country; -} -if($!raw!request_ip_lat != "") then { - set $!usr!sql!request_ip_lat = $!raw!request_ip_lat; -} -if($!raw!request_ip_lon != "") then { - set $!usr!sql!request_ip_lon = $!raw!request_ip_lon; -} -if($!raw!request_ip_region != "") then { - set $!usr!sql!request_ip_region = $!raw!request_ip_region; -} -if($!raw!request_method != "") then { - set $!usr!sql!request_method = $!raw!request_method; -} -if($!raw!request_origin != "") then { - set $!usr!sql!request_origin = $!raw!request_origin; -} -if($!raw!request_referer != "") then { - set $!usr!sql!request_referer = $!raw!request_referer; -} -if($!raw!request_size != "") then { - set $!usr!sql!request_size = $!raw!request_size; -} -if($!raw!request_url_host != "") then { - set $!usr!sql!request_url_host = $!raw!request_url_host; -} -if($!raw!request_url_path != "") then { - set $!usr!sql!request_url_path = $!raw!request_url_path; -} -if($!raw!request_url_path_level1 != "") then { - set $!usr!sql!request_url_path_level1 = $!raw!request_url_path_level1; -} -if($!raw!request_url_path_level2 != "") then { - set $!usr!sql!request_url_path_level2 = $!raw!request_url_path_level2; -} -if($!raw!request_url_path_level3 != "") then { - set $!usr!sql!request_url_path_level3 = $!raw!request_url_path_level3; -} -if($!raw!request_url_path_level4 != "") then { - set $!usr!sql!request_url_path_level4 = $!raw!request_url_path_level4; -} -if($!raw!request_url_path_level5 != "") then { - set $!usr!sql!request_url_path_level5 = $!raw!request_url_path_level5; -} -if($!raw!request_url_path_level6 != "") then { - set $!usr!sql!request_url_path_level6 = $!raw!request_url_path_level6; -} -if($!raw!request_url_query != "") then { - set $!usr!sql!request_url_query = $!raw!request_url_query; -} -if($!raw!request_url_scheme != "") then { - set $!usr!sql!request_url_scheme = $!raw!request_url_scheme; -} -if($!raw!request_user_agent != "") then { - set $!usr!sql!request_user_agent = $!raw!request_user_agent; -} -if($!raw!request_user_agent_family != "") then { - set $!usr!sql!request_user_agent_family = $!raw!request_user_agent_family; -} -if($!raw!request_user_agent_type != "") then { - set $!usr!sql!request_user_agent_type = $!raw!request_user_agent_type; -} -if($!raw!response_age != "") then { - set $!usr!sql!response_age = $!raw!response_age; -} -if($!raw!response_cache != "") then { - set $!usr!sql!response_cache = $!raw!response_cache; -} -if($!raw!response_content_encoding != "") then { - set $!usr!sql!response_content_encoding = 
$!raw!response_content_encoding; -} -if($!raw!response_content_length != "") then { - set $!usr!sql!response_content_length = $!raw!response_content_length; -} -if($!raw!response_content_type != "") then { - set $!usr!sql!response_content_type = $!raw!response_content_type; -} -if($!raw!response_server != "") then { - set $!usr!sql!response_server = $!raw!response_server; -} -if($!raw!response_size != "") then { - set $!usr!sql!response_size = $!raw!response_size; -} -if($!raw!response_status != "") then { - set $!usr!sql!response_status = $!raw!response_status; -} -if($!raw!response_transfer_encoding != "") then { - set $!usr!sql!response_transfer_encoding = $!raw!response_transfer_encoding; -} -if($!raw!timer_response != "") then { - set $!usr!sql!timer_response = $!raw!timer_response; -} -if($!raw!timestamp_tz_date != "") then { - set $!usr!sql!timestamp_tz_date = $!raw!timestamp_tz_date; -} -if($!raw!timestamp_tz_hour != "") then { - set $!usr!sql!timestamp_tz_hour = $!raw!timestamp_tz_hour; -} -if($!raw!timestamp_tz_minute != "") then { - set $!usr!sql!timestamp_tz_minute = $!raw!timestamp_tz_minute; -} -if($!raw!timestamp_tz_month != "") then { - set $!usr!sql!timestamp_tz_month = $!raw!timestamp_tz_month; -} -if($!raw!timestamp_tz_offset != "") then { - set $!usr!sql!timestamp_tz_offset = $!raw!timestamp_tz_offset; -} -if($!raw!timestamp_tz_week != "") then { - set $!usr!sql!timestamp_tz_week = $!raw!timestamp_tz_week; -} -if($!raw!timestamp_tz_year != "") then { - set $!usr!sql!timestamp_tz_year = $!raw!timestamp_tz_year; -} -if($!raw!timestamp_utc != "") then { - set $!usr!sql!timestamp_utc = $!raw!timestamp_utc; -} -if($!raw!user_id != "") then { - set $!usr!sql!user_id = $!raw!user_id; -} - -# Define templates for SQL output via JSON records. -template(name="sql-json-record" type="subtree" subtree="$!usr!sql") - -# Output to Kafka. -# A disk-assisted memory queue is used for buffering. -local0.info action( - name="output-kafka" - type="omkafka" - broker=[{{kafka._rsyslog_broker}}] - topic="{{kafka.topic}}" - confParam=["compression.codec=gzip"] - template="sql-json-record" - queue.type="LinkedList" - queue.filename="queue-kafka" - queue.checkpointinterval="10" - queue.saveonshutdown="on" - queue.size="15000" - queue.highwatermark="10000" - queue.lowwatermark="2000" - action.resumeRetryCount="-1" -) -{{/analytics._output_kylin?}} - # Output to local file for redundancy until we prove the new system out. 
template(name="all-json-record" type="list") { property(name="$!raw") constant(value="\n") diff --git a/test/admin_ui/test_legacy_redirects.rb b/test/admin_ui/test_legacy_redirects.rb index d47799b3d..ee1b019e5 100644 --- a/test/admin_ui/test_legacy_redirects.rb +++ b/test/admin_ui/test_legacy_redirects.rb @@ -16,7 +16,7 @@ def setup def test_drilldown admin_login - visit "/admin/#/stats/drilldown/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour&beta_analytics=false®ion=US" + visit "/admin/#/stats/drilldown/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour®ion=US" assert_link("Download CSV", :href => /start_at=2015-01-15/) assert_current_admin_url("/stats/drilldown", { @@ -29,7 +29,7 @@ def test_drilldown def test_logs admin_login - visit "/admin/#/stats/logs/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour&beta_analytics=false®ion=US" + visit "/admin/#/stats/logs/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour®ion=US" assert_link("Download CSV", :href => /start_at=2015-01-15/) assert_current_admin_url("/stats/logs", { @@ -42,7 +42,7 @@ def test_logs def test_users admin_login - visit 
"/admin/#/stats/users/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour&beta_analytics=false®ion=US" + visit "/admin/#/stats/users/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour®ion=US" assert_link("Download CSV", :href => /start_at=2015-01-15/) assert_current_admin_url("/stats/users", { @@ -54,7 +54,7 @@ def test_users def test_map admin_login - visit "/admin/#/stats/map/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour&beta_analytics=false®ion=US" + visit "/admin/#/stats/map/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour®ion=US" assert_link("Download CSV", :href => /start_at=2015-01-15/) assert_current_admin_url("/stats/map", { diff --git a/test/admin_ui/test_stats_drilldown.rb b/test/admin_ui/test_stats_drilldown.rb index 5a5fc3ac3..8e0feeecd 100644 --- a/test/admin_ui/test_stats_drilldown.rb +++ b/test/admin_ui/test_stats_drilldown.rb @@ -43,7 +43,6 @@ def test_csv_download "search" => "", "query" => default_query, "prefix" => "0/", - "beta_analytics" => "false", }, uri.query_values.except("api_key")) assert_equal(40, uri.query_values["api_key"].length) diff --git a/test/admin_ui/test_stats_logs.rb b/test/admin_ui/test_stats_logs.rb index ea4cca855..21a79cdaa 100644 --- a/test/admin_ui/test_stats_logs.rb +++ b/test/admin_ui/test_stats_logs.rb @@ -59,7 +59,6 @@ def test_csv_download_link_changes_with_filters "end_at" => "2015-01-18", "interval" => "day", "query" => default_query, - "beta_analytics" => "false", }, uri.query_values) visit "/admin/#/stats/logs?search=&start_at=2015-01-13&end_at=2015-01-18&interval=day" @@ -74,7 +73,6 @@ def 
test_csv_download_link_changes_with_filters "end_at" => "2015-01-18", "interval" => "day", "query" => default_query, - "beta_analytics" => "false", }, uri.query_values) visit "/admin/#/stats/logs?search=&start_at=2015-01-12&end_at=2015-01-18&interval=day" @@ -94,7 +92,6 @@ def test_csv_download_link_changes_with_filters "interval" => "day", "query" => JSON.generate({ "condition" => "AND", "rules" => [], "valid" => true }), "search" => "", - "beta_analytics" => "false", }, uri.query_values) visit "/admin/#/stats/logs?search=&start_at=2015-01-13&end_at=2015-01-18&interval=day" @@ -114,7 +111,6 @@ def test_csv_download_link_changes_with_filters "end_at" => "2015-01-18", "interval" => "day", "query" => "", - "beta_analytics" => "false", }, uri.query_values) end @@ -173,22 +169,6 @@ def test_changing_intervals assert_link("Download CSV", :href => /interval=minute/) end - def test_does_not_show_beta_analytics_toggle_by_default - admin_login - visit "/admin/#/stats/logs?search=&start_at=2015-01-12&end_at=2015-01-18&interval=day" - assert_text("view top users") - refute_text("Beta Analytics") - end - - def test_shows_beta_analytics_toggle_when_enabled - override_config({ "analytics" => { "outputs" => ["kylin"] } }, nil) do - admin_login - visit "/admin/#/stats/logs?search=&start_at=2015-01-12&end_at=2015-01-18&interval=day" - assert_text("view top users") - assert_text("Beta Analytics") - end - end - def test_date_range_picker assert_date_range_picker("/stats/logs") end diff --git a/test/admin_ui/test_stats_map.rb b/test/admin_ui/test_stats_map.rb index c3f4a4b3a..667513bc2 100644 --- a/test/admin_ui/test_stats_map.rb +++ b/test/admin_ui/test_stats_map.rb @@ -42,7 +42,6 @@ def test_csv_download "search" => "", "query" => default_query, "region" => "world", - "beta_analytics" => "false", }, uri.query_values) # Wait for the ajax actions to fetch the graph and tables to both diff --git a/test/admin_ui/test_stats_users.rb b/test/admin_ui/test_stats_users.rb index be2889ca5..a57855b12 100644 --- a/test/admin_ui/test_stats_users.rb +++ b/test/admin_ui/test_stats_users.rb @@ -70,7 +70,6 @@ def test_csv_download "end_at" => "2015-01-18", "search" => "", "query" => default_query, - "beta_analytics" => "false", }, uri.query_values) # Wait for the ajax actions to fetch the graph and tables to both diff --git a/test/apis/admin/test_auth.rb b/test/apis/admin/test_auth.rb index c77a6f8a8..527a3f532 100644 --- a/test/apis/admin/test_auth.rb +++ b/test/apis/admin/test_auth.rb @@ -36,7 +36,6 @@ def test_authenticated "api_key", "api_umbrella_version", "authenticated", - "enable_beta_analytics", "local_auth_enabled", "password_length_min", "username_is_email", @@ -48,7 +47,6 @@ def test_authenticated assert_kind_of(String, data["api_key"]) assert_kind_of(String, data["api_umbrella_version"]) assert_includes([TrueClass, FalseClass], data["authenticated"].class) - assert_includes([TrueClass, FalseClass], data["enable_beta_analytics"].class) assert_equal([ "email",