- {{/if}}
-
{{#if enableInterval}}
{{#bs-button-group value=interval onChange=(action (mut interval)) type="radio" size="xs" id="interval_buttons" as |bg|}}
{{#bg.button value="minute"}}{{t "admin.stats.minute"}}{{/bg.button}}
diff --git a/src/api-umbrella/admin-ui/app/templates/stats/drilldown.hbs b/src/api-umbrella/admin-ui/app/templates/stats/drilldown.hbs
index 342c6874f..a6fe59169 100644
--- a/src/api-umbrella/admin-ui/app/templates/stats/drilldown.hbs
+++ b/src/api-umbrella/admin-ui/app/templates/stats/drilldown.hbs
@@ -1,4 +1,4 @@
-{{stats/query-form enableInterval=true date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval beta_analytics=beta_analytics allQueryParamValues=allQueryParamValues dateRanges=dateRanges}}
+{{stats/query-form enableInterval=true date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval allQueryParamValues=allQueryParamValues dateRanges=dateRanges}}
{{stats/drilldown/results-chart hitsOverTime=model.hits_over_time}}
{{stats/drilldown/results-breadcrumbs breadcrumbs=model.breadcrumbs}}
{{stats/drilldown/results-table results=model.results presentQueryParamValues=presentQueryParamValues backendQueryParamValues=backendQueryParamValues}}
diff --git a/src/api-umbrella/admin-ui/app/templates/stats/logs.hbs b/src/api-umbrella/admin-ui/app/templates/stats/logs.hbs
index e96aa5926..5c6c791af 100644
--- a/src/api-umbrella/admin-ui/app/templates/stats/logs.hbs
+++ b/src/api-umbrella/admin-ui/app/templates/stats/logs.hbs
@@ -1,4 +1,4 @@
-{{stats/query-form enableInterval=true date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval beta_analytics=beta_analytics allQueryParamValues=allQueryParamValues dateRanges=dateRanges}}
+{{stats/query-form enableInterval=true date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval allQueryParamValues=allQueryParamValues dateRanges=dateRanges}}
{{stats/logs/results-chart hitsOverTime=model.hits_over_time}}
{{stats/logs/results-highlights stats=model.stats aggregations=model.aggregations presentQueryParamValues=presentQueryParamValues}}
{{stats/logs/results-table presentQueryParamValues=presentQueryParamValues backendQueryParamValues=backendQueryParamValues}}
diff --git a/src/api-umbrella/admin-ui/app/templates/stats/map.hbs b/src/api-umbrella/admin-ui/app/templates/stats/map.hbs
index f34d0dc1c..38255aac3 100644
--- a/src/api-umbrella/admin-ui/app/templates/stats/map.hbs
+++ b/src/api-umbrella/admin-ui/app/templates/stats/map.hbs
@@ -1,4 +1,4 @@
-{{stats/query-form enableInterval=false date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval beta_analytics=beta_analytics allQueryParamValues=allQueryParamValues dateRanges=dateRanges}}
+{{stats/query-form enableInterval=false date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval allQueryParamValues=allQueryParamValues dateRanges=dateRanges}}
{{stats/map/results-breadcrumbs breadcrumbs=model.map_breadcrumbs}}
{{stats/map/results-map regions=model.map_regions regionField=model.region_field presentQueryParamValues=presentQueryParamValues allQueryParamValues=allQueryParamValues}}
diff --git a/src/api-umbrella/admin-ui/app/templates/stats/users.hbs b/src/api-umbrella/admin-ui/app/templates/stats/users.hbs
index 7c5e242ad..8eaf24cfc 100644
--- a/src/api-umbrella/admin-ui/app/templates/stats/users.hbs
+++ b/src/api-umbrella/admin-ui/app/templates/stats/users.hbs
@@ -1,2 +1,2 @@
-{{stats/query-form enableInterval=false date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval beta_analytics=beta_analytics allQueryParamValues=allQueryParamValues dateRanges=dateRanges}}
+{{stats/query-form enableInterval=false date_range=date_range start_at=start_at end_at=end_at query=query search=search interval=interval allQueryParamValues=allQueryParamValues dateRanges=dateRanges}}
{{stats/users/results-table presentQueryParamValues=presentQueryParamValues backendQueryParamValues=backendQueryParamValues}}
diff --git a/src/api-umbrella/cli/read_config.lua b/src/api-umbrella/cli/read_config.lua
index 2f14856d8..1cf6309e9 100644
--- a/src/api-umbrella/cli/read_config.lua
+++ b/src/api-umbrella/cli/read_config.lua
@@ -296,12 +296,6 @@ local function set_computed_config()
config["analytics"]["outputs"] = { config["analytics"]["adapter"] }
end
- config["kafka"]["_rsyslog_broker"] = {}
- for _, broker in ipairs(config["kafka"]["brokers"]) do
- table.insert(config["kafka"]["_rsyslog_broker"], '"' .. broker["host"] .. ":" .. broker["port"] .. '"')
- end
- config["kafka"]["_rsyslog_broker"] = table.concat(config["kafka"]["_rsyslog_broker"], ",")
-
-- Setup the request/response timeouts for the different pieces of the stack.
-- Since we traverse multiple proxies, we want to make sure the timeouts of
-- the different proxies are kept in sync.
@@ -337,7 +331,6 @@ local function set_computed_config()
["_development_env?"] = (config["app_env"] == "development"),
analytics = {
["_output_elasticsearch?"] = array_includes(config["analytics"]["outputs"], "elasticsearch"),
- ["_output_kylin?"] = array_includes(config["analytics"]["outputs"], "kylin"),
},
mongodb = {
_database = plutils.split(array_last(plutils.split(config["mongodb"]["url"], "/", true)), "?", true)[1],
@@ -361,7 +354,6 @@ local function set_computed_config()
},
["_service_general_db_enabled?"] = array_includes(config["services"], "general_db"),
["_service_log_db_enabled?"] = array_includes(config["services"], "log_db"),
- ["_service_hadoop_db_enabled?"] = array_includes(config["services"], "hadoop_db"),
["_service_router_enabled?"] = array_includes(config["services"], "router"),
["_service_web_enabled?"] = array_includes(config["services"], "web"),
["_service_nginx_reloader_enabled?"] = (array_includes(config["services"], "router") and config["nginx"]["_reloader_frequency"]),
diff --git a/src/api-umbrella/hadoop-analytics/.gitignore b/src/api-umbrella/hadoop-analytics/.gitignore
deleted file mode 100644
index fc81c16f9..000000000
--- a/src/api-umbrella/hadoop-analytics/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-/.classpath
-/.project
-/.settings
-/bin/
-/dependency-reduced-pom.xml
-/target/
diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/.gitignore b/src/api-umbrella/hadoop-analytics/elasticsearch-import/.gitignore
deleted file mode 100644
index c0ca0f21c..000000000
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-/.classpath
-/.project
-/.settings
-/dependency-reduced-pom.xml
-/migrate.log
-/target/
diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/README.md b/src/api-umbrella/hadoop-analytics/elasticsearch-import/README.md
deleted file mode 100644
index ed3b6ce96..000000000
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/README.md
+++ /dev/null
@@ -1,443 +0,0 @@
-```
-# Import from ElasticSearch to HDFS ORC files.
-$ cd hadoop-analytics
-$ mvn clean package
-$ sudo -u hive java -Dapiumbrella.page_size=10000 -Dapiumbrella.elasticsearch_url="http://ELASTICSEARCH_HOST:9200" -Dapiumbrella.hdfs_uri="hdfs://HDFS_HOST:8020" -Dapiumbrella.timezone=TIMEZONE -jar elasticsearch-import/target/elasticsearch-import-0.0.1-SNAPSHOT.jar
-
-# Create the Hive table.
-$ sudo -u hive hive
-hive> CREATE DATABASE api_umbrella;
-hive> CREATE EXTERNAL TABLE api_umbrella.logs(timestamp_utc BIGINT, id STRING, timestamp_tz_offset INT, timestamp_tz_hour STRING, timestamp_tz_minute STRING, user_id STRING, denied_reason STRING, request_method STRING, request_url_scheme STRING, request_url_host STRING, request_url_port INT, request_url_path STRING, request_url_path_level1 STRING, request_url_path_level2 STRING, request_url_path_level3 STRING, request_url_path_level4 STRING, request_url_path_level5 STRING, request_url_path_level6 STRING, request_url_query STRING, request_ip STRING, request_ip_country STRING, request_ip_region STRING, request_ip_city STRING, request_ip_lat DOUBLE, request_ip_lon DOUBLE, request_user_agent STRING, request_user_agent_type STRING, request_user_agent_family STRING, request_size INT, request_accept STRING, request_accept_encoding STRING, request_content_type STRING, request_connection STRING, request_origin STRING, request_referer STRING, request_basic_auth_username STRING, response_status SMALLINT, response_content_type STRING, response_content_length INT, response_content_encoding STRING, response_transfer_encoding STRING, response_server STRING, response_cache STRING, response_age INT, response_size INT, timer_response DOUBLE, timer_backend_response DOUBLE, timer_internal DOUBLE, timer_proxy_overhead DOUBLE, log_imported BOOLEAN) PARTITIONED BY (timestamp_tz_year DATE, timestamp_tz_month DATE, timestamp_tz_week DATE, timestamp_tz_date DATE) STORED AS ORC LOCATION '/apps/api-umbrella/logs';
-hive> exit;
-
-# Create the Kylin project and define the table data source.
-$ curl 'http://ADMIN:KYLIN@localhost:7070/kylin/api/projects' -H 'Content-Type: application/json;charset=UTF-8' --data-binary '{"name":"api_umbrella","description":""}'
-$ curl 'http://ADMIN:KYLIN@localhost:7070/kylin/api/tables/api_umbrella.logs/api_umbrella' -H 'Content-Type: application/json;charset=UTF-8' --data-binary '{}'
-
-# WAIT... Wait for the Kylin cardinality job to complete before proceeding
-# (progress is viewable in the kylin.log file). If you define the table source
-# after importing all the partitions, then the cardinality job will take forever
-# in Kylin. Since we don't actually need the cardinality calculations, we'll
-# just ensure they run against an empty table.
-#
-# If this issue gets addressed, then we would have more control over the
-# cardinality job, and wouldn't have to wait:
-# https://issues.apache.org/jira/browse/KYLIN-1407
-
-# Add all the daily partitions to the Hive table.
-$ sudo -u hive hadoop fs -ls -R /apps/api-umbrella/logs | grep -E "(\.orc|000000_0)$" | grep -o "[^ ]*$" | sort -V | sed -e "s/.*timestamp_tz_year=\([0-9\-]\+\).*timestamp_tz_month=\([0-9\-]\+\).*timestamp_tz_week=\([0-9\-]\+\).*timestamp_tz_date=\([0-9\-]\+\).*/ALTER TABLE api_umbrella.logs ADD IF NOT EXISTS PARTITION(timestamp_tz_year='\1', timestamp_tz_month='\2', timestamp_tz_week='\3', timestamp_tz_date='\4');/" | uniq > /tmp/api_umbrella_load_partitions.sql
-$ sudo -u hive hive -f /tmp/api_umbrella_load_partitions.sql
-
-# Create the model.
-$ echo '{"modelDescData":"{
- \"name\": \"logs_model\",
- \"description\": \"\",
- \"fact_table\": \"API_UMBRELLA.LOGS\",
- \"lookups\": [],
- \"filter_condition\": \"\",
- \"capacity\": \"MEDIUM\",
- \"dimensions\": [
- {
- \"table\": \"API_UMBRELLA.LOGS\",
- \"columns\": [
- \"TIMESTAMP_TZ_YEAR\",
- \"TIMESTAMP_TZ_MONTH\",
- \"TIMESTAMP_TZ_WEEK\",
- \"TIMESTAMP_TZ_DATE\",
- \"REQUEST_URL_HOST\",
- \"REQUEST_URL_PATH_LEVEL1\",
- \"REQUEST_URL_PATH_LEVEL2\",
- \"REQUEST_URL_PATH_LEVEL3\",
- \"REQUEST_URL_PATH_LEVEL4\",
- \"REQUEST_URL_PATH_LEVEL5\",
- \"REQUEST_URL_PATH_LEVEL6\",
- \"USER_ID\",
- \"REQUEST_IP\",
- \"RESPONSE_STATUS\",
- \"DENIED_REASON\",
- \"REQUEST_METHOD\",
- \"REQUEST_IP_COUNTRY\",
- \"REQUEST_IP_REGION\",
- \"REQUEST_IP_CITY\"
- ]
- }
- ],
- \"metrics\": [
- \"USER_ID\",
- \"REQUEST_IP\",
- \"TIMER_RESPONSE\",
- \"TIMESTAMP_UTC\"
- ],
- \"partition_desc\": {
- \"partition_date_column\": \"API_UMBRELLA.LOGS.TIMESTAMP_TZ_DATE\",
- \"partition_date_format\": \"yyyy-MM-dd\",
- \"partition_date_start\": null,
- \"partition_type\": \"APPEND\"
- },
- \"last_modified\": 0
-}","project":"api_umbrella"}' | perl -p -e 's/\n/\\n/' | curl -v -XPOST -H "Content-Type: application/json;charset=UTF-8" --data-binary @- "http://ADMIN:KYLIN@localhost:7070/kylin/api/models"
-
-# Create the cube.
-$ echo '{"cubeDescData":"{
- \"name\": \"logs_cube\",
- \"model_name\": \"logs_model\",
- \"description\": \"\",
- \"dimensions\": [
- {
- \"name\": \"API_UMBRELLA.LOGS.USER_ID\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"USER_ID\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.DENIED_REASON\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"DENIED_REASON\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_METHOD\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_METHOD\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_HOST\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_URL_HOST\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL1\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_URL_PATH_LEVEL1\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL2\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_URL_PATH_LEVEL2\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL3\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_URL_PATH_LEVEL3\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL4\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_URL_PATH_LEVEL4\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL5\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_URL_PATH_LEVEL5\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_URL_PATH_LEVEL6\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_URL_PATH_LEVEL6\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_IP\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_IP\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_IP_COUNTRY\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_IP_COUNTRY\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_IP_REGION\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_IP_REGION\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.REQUEST_IP_CITY\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"REQUEST_IP_CITY\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.RESPONSE_STATUS\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"RESPONSE_STATUS\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.TIMESTAMP_TZ_YEAR\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"TIMESTAMP_TZ_YEAR\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.TIMESTAMP_TZ_MONTH\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"TIMESTAMP_TZ_MONTH\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.TIMESTAMP_TZ_WEEK\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"TIMESTAMP_TZ_WEEK\",
- \"derived\": null
- },
- {
- \"name\": \"API_UMBRELLA.LOGS.TIMESTAMP_TZ_DATE\",
- \"table\": \"API_UMBRELLA.LOGS\",
- \"column\": \"TIMESTAMP_TZ_DATE\",
- \"derived\": null
- }
- ],
- \"measures\": [
- {
- \"name\": \"_COUNT_\",
- \"function\": {
- \"expression\": \"COUNT\",
- \"parameter\": {
- \"type\": \"constant\",
- \"value\": \"1\",
- \"next_parameter\": null
- },
- \"returntype\": \"bigint\"
- },
- \"dependent_measure_ref\": null
- },
- {
- \"name\": \"COUNT_DISTINCT_USER_ID\",
- \"function\": {
- \"expression\": \"COUNT_DISTINCT\",
- \"parameter\": {
- \"type\": \"column\",
- \"value\": \"USER_ID\",
- \"next_parameter\": null
- },
- \"returntype\": \"hllc12\"
- },
- \"dependent_measure_ref\": null
- },
- {
- \"name\": \"COUNT_DISTINCT_REQUEST_IP\",
- \"function\": {
- \"expression\": \"COUNT_DISTINCT\",
- \"parameter\": {
- \"type\": \"column\",
- \"value\": \"REQUEST_IP\",
- \"next_parameter\": null
- },
- \"returntype\": \"hllc12\"
- },
- \"dependent_measure_ref\": null
- },
- {
- \"name\": \"SUM_TIMER_RESPONSE\",
- \"function\": {
- \"expression\": \"SUM\",
- \"parameter\": {
- \"type\": \"column\",
- \"value\": \"TIMER_RESPONSE\",
- \"next_parameter\": null
- },
- \"returntype\": \"decimal\"
- },
- \"dependent_measure_ref\": null
- },
- {
- \"name\": \"MAX_TIMESTAMP_UTC\",
- \"function\": {
- \"expression\": \"MAX\",
- \"parameter\": {
- \"type\": \"column\",
- \"value\": \"TIMESTAMP_UTC\",
- \"next_parameter\": null
- },
- \"returntype\": \"bigint\"
- },
- \"dependent_measure_ref\": null
- }
- ],
- \"rowkey\": {
- \"rowkey_columns\": [
- {
- \"column\": \"TIMESTAMP_TZ_YEAR\",
- \"encoding\": \"dict\"
- },
- {
- \"column\": \"TIMESTAMP_TZ_MONTH\",
- \"encoding\": \"dict\"
- },
- {
- \"column\": \"TIMESTAMP_TZ_WEEK\",
- \"encoding\": \"dict\"
- },
- {
- \"column\": \"TIMESTAMP_TZ_DATE\",
- \"encoding\": \"dict\"
- },
- {
- \"column\": \"REQUEST_URL_HOST\",
- \"encoding\": \"dict\"
- },
- {
- \"column\": \"REQUEST_URL_PATH_LEVEL1\",
- \"encoding\": \"fixed_length:40\"
- },
- {
- \"column\": \"DENIED_REASON\",
- \"encoding\": \"dict\"
- },
- {
- \"column\": \"USER_ID\",
- \"encoding\": \"fixed_length:36\"
- },
- {
- \"column\": \"REQUEST_IP\",
- \"encoding\": \"fixed_length:45\"
- },
- {
- \"column\": \"REQUEST_URL_PATH_LEVEL2\",
- \"encoding\": \"fixed_length:40\"
- },
- {
- \"column\": \"REQUEST_URL_PATH_LEVEL3\",
- \"encoding\": \"fixed_length:40\"
- },
- {
- \"column\": \"REQUEST_URL_PATH_LEVEL4\",
- \"encoding\": \"fixed_length:40\"
- },
- {
- \"column\": \"REQUEST_URL_PATH_LEVEL5\",
- \"encoding\": \"fixed_length:40\"
- },
- {
- \"column\": \"REQUEST_URL_PATH_LEVEL6\",
- \"encoding\": \"fixed_length:40\"
- },
- {
- \"column\": \"REQUEST_IP_COUNTRY\",
- \"encoding\": \"dict\"
- },
- {
- \"column\": \"REQUEST_IP_REGION\",
- \"encoding\": \"dict\"
- },
- {
- \"column\": \"REQUEST_IP_CITY\",
- \"encoding\": \"dict\"
- },
- {
- \"column\": \"RESPONSE_STATUS\",
- \"encoding\": \"dict\"
- },
- {
- \"column\": \"REQUEST_METHOD\",
- \"encoding\": \"dict\"
- }
- ]
- },
- \"hbase_mapping\": {
- \"column_family\": [
- {
- \"name\": \"f1\",
- \"columns\": [
- {
- \"qualifier\": \"m\",
- \"measure_refs\": [
- \"_COUNT_\",
- \"SUM_TIMER_RESPONSE\",
- \"MAX_TIMESTAMP_UTC\"
- ]
- }
- ]
- },
- {
- \"name\": \"f2\",
- \"columns\": [
- {
- \"qualifier\": \"m\",
- \"measure_refs\": [
- \"COUNT_DISTINCT_USER_ID\",
- \"COUNT_DISTINCT_REQUEST_IP\"
- ]
- }
- ]
- }
- ]
- },
- \"aggregation_groups\": [
- {
- \"includes\": [
- \"TIMESTAMP_TZ_YEAR\",
- \"TIMESTAMP_TZ_MONTH\",
- \"TIMESTAMP_TZ_WEEK\",
- \"TIMESTAMP_TZ_DATE\",
- \"REQUEST_URL_HOST\",
- \"REQUEST_URL_PATH_LEVEL1\",
- \"REQUEST_URL_PATH_LEVEL2\",
- \"REQUEST_URL_PATH_LEVEL3\",
- \"REQUEST_URL_PATH_LEVEL4\",
- \"REQUEST_URL_PATH_LEVEL5\",
- \"REQUEST_URL_PATH_LEVEL6\",
- \"USER_ID\",
- \"REQUEST_IP\",
- \"RESPONSE_STATUS\",
- \"DENIED_REASON\",
- \"REQUEST_METHOD\",
- \"REQUEST_IP_COUNTRY\",
- \"REQUEST_IP_REGION\",
- \"REQUEST_IP_CITY\"
- ],
- \"select_rule\": {
- \"hierarchy_dims\": [
- [
- \"REQUEST_URL_HOST\",
- \"REQUEST_URL_PATH_LEVEL1\",
- \"REQUEST_URL_PATH_LEVEL2\",
- \"REQUEST_URL_PATH_LEVEL3\",
- \"REQUEST_URL_PATH_LEVEL4\",
- \"REQUEST_URL_PATH_LEVEL5\",
- \"REQUEST_URL_PATH_LEVEL6\"
- ],
- [
- \"REQUEST_IP_COUNTRY\",
- \"REQUEST_IP_REGION\",
- \"REQUEST_IP_CITY\"
- ]
- ],
- \"mandatory_dims\": [
- \"TIMESTAMP_TZ_YEAR\",
- \"TIMESTAMP_TZ_MONTH\",
- \"TIMESTAMP_TZ_WEEK\",
- \"TIMESTAMP_TZ_DATE\"
- ],
- \"joint_dims\": [
- [
- \"USER_ID\",
- \"REQUEST_IP\"
- ]
- ]
- }
- }
- ],
- \"notify_list\": [],
- \"status_need_notify\": [],
- \"partition_date_start\": 1281916800000,
- \"auto_merge_time_ranges\": [
- 604800000,
- 2419200000
- ],
- \"retention_range\": 0,
- \"engine_type\": 2,
- \"storage_type\": 2
-}","cubeName":"logs_cube","project":"api_umbrella","streamingCube":false}' | perl -p -e 's/\n/\\n/' | curl -v -XPOST -H "Content-Type: application/json;charset=UTF-8" --data-binary @- "http://ADMIN:KYLIN@localhost:7070/kylin/api/cubes"
-```
diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/logging.properties b/src/api-umbrella/hadoop-analytics/elasticsearch-import/logging.properties
deleted file mode 100644
index d8c1d20e4..000000000
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/logging.properties
+++ /dev/null
@@ -1,2 +0,0 @@
-org.apache.parquet.handlers=java.util.logging.ConsoleHandler
-java.util.logging.ConsoleHandler.level=SEVERE
diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/pom.xml b/src/api-umbrella/hadoop-analytics/elasticsearch-import/pom.xml
deleted file mode 100644
index 65221aecd..000000000
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/pom.xml
+++ /dev/null
@@ -1,114 +0,0 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>api-umbrella.hadoop-analytics</groupId>
-    <artifactId>hadoop-analytics</artifactId>
-    <version>0.0.1-SNAPSHOT</version>
-  </parent>
-  <artifactId>elasticsearch-import</artifactId>
-  <packaging>jar</packaging>
-
-  <name>elasticsearch-import</name>
-  <url>http://maven.apache.org</url>
-
-  <properties>
-    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    <maven.compiler.source>1.7</maven.compiler.source>
-    <maven.compiler.target>1.7</maven.compiler.target>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>api-umbrella.hadoop-analytics</groupId>
-      <artifactId>schema</artifactId>
-      <version>0.0.1-SNAPSHOT</version>
-    </dependency>
-
-    <dependency>
-      <groupId>io.searchbox</groupId>
-      <artifactId>jest</artifactId>
-      <version>2.0.0</version>
-    </dependency>
-
-    <dependency>
-      <groupId>joda-time</groupId>
-      <artifactId>joda-time</artifactId>
-      <version>2.9.2</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-simple</artifactId>
-      <version>1.7.16</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.hive</groupId>
-      <artifactId>hive-exec</artifactId>
-      <version>0.14.0</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-client</artifactId>
-      <version>2.6.0</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.calcite</groupId>
-      <artifactId>calcite-core</artifactId>
-      <version>0.9.2-incubating</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.calcite</groupId>
-      <artifactId>calcite-avatica</artifactId>
-      <version>0.9.2-incubating</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <directory>${buildDir}/elasticsearch-import</directory>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-shade-plugin</artifactId>
-        <version>2.4.3</version>
-        <executions>
-          <execution>
-            <phase>package</phase>
-            <goals>
-              <goal>shade</goal>
-            </goals>
-            <configuration>
-              <transformers>
-                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
-                  <mainClass>apiumbrella.hadoop_analytics.App</mainClass>
-                </transformer>
-              </transformers>
-              <filters>
-                <filter>
-                  <artifact>*:*</artifact>
-                  <excludes>
-                    <exclude>META-INF/*.SF</exclude>
-                    <exclude>META-INF/*.DSA</exclude>
-                    <exclude>META-INF/*.RSA</exclude>
-                  </excludes>
-                </filter>
-              </filters>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-    </plugins>
-  </build>
-</project>
diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/App.java b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/App.java
deleted file mode 100644
index acacf1d51..000000000
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/App.java
+++ /dev/null
@@ -1,145 +0,0 @@
-package apiumbrella.hadoop_analytics;
-
-import java.io.IOException;
-import java.math.BigInteger;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Map.Entry;
-import java.util.Set;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.joda.time.DateTime;
-import org.joda.time.DateTimeZone;
-import org.joda.time.Period;
-import org.joda.time.format.DateTimeFormatter;
-import org.joda.time.format.ISODateTimeFormat;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.gson.JsonElement;
-
-import io.searchbox.client.JestClient;
-import io.searchbox.client.JestClientFactory;
-import io.searchbox.client.JestResult;
-import io.searchbox.client.config.HttpClientConfig;
-import io.searchbox.indices.aliases.GetAliases;
-
-public class App {
- protected static final String HDFS_URI =
- System.getProperty("apiumbrella.hdfs_uri", "hdfs://127.0.0.1:8020");
- protected static String DIR = System.getProperty("apiumbrella.dir", "/apps/api-umbrella/logs");
- protected static String ELASTICSEARCH_URL =
- System.getProperty("apiumbrella.elasticsearch_url", "http://localhost:9200");
- protected static int PAGE_SIZE =
- Integer.parseInt(System.getProperty("apiumbrella.page_size", "5000"));
- protected static int CONCURRENCY =
- Integer.parseInt(System.getProperty("apiumbrella.concurrency", "4"));
- protected static String START_DATE = System.getProperty("apiumbrella.start_date");
- protected static String END_DATE = System.getProperty("apiumbrella.end_date");
- protected static DateTimeZone TIMEZONE =
- DateTimeZone.forID(System.getProperty("apiumbrella.timezone", "UTC"));
-
- // Define fields we won't migrate to the new database.
- protected static final Set<String> SKIP_FIELDS = new HashSet<String>(Arrays.asList(new String[] {
- // These URL-related fields are handled specially when dealing with the
- // request_url field.
- "request_scheme", "request_host", "request_path", "request_query",
-
- // We're no longer storing the special hierarchy field.
- "request_hierarchy", "request_path_hierarchy",
-
- // We're only storing the user_id, and not other fields that can be derived
- // from it (lookups on these fields will need to first query the user table,
- // and then perform queries based on user_id).
- "user_registration_source", "user_email", "api_key",
-
- // Old timer field from the nodejs stack, that's not relevant anymore, and
- // we don't need to migrate over (it was always somewhat duplicative).
- "internal_response_time",
-
- // Junk field we've seen on some old data.
- "_type"}));
-
- final Logger logger = LoggerFactory.getLogger(App.class);
- private BigInteger globalHits = BigInteger.valueOf(0);
-
- public App() {
- System.out.println("Logging to log/elasticsearch-import.log...");
-
- ExecutorService executor = Executors.newFixedThreadPool(CONCURRENCY);
-
- DateTime date = this.getStartDate();
- DateTime endDate = this.getEndDate();
- while (date.isBefore(endDate)) {
- Runnable worker = new DayWorker(this, date);
- executor.execute(worker);
-
- date = date.plus(Period.days(1));
- }
-
- executor.shutdown();
- while (!executor.isTerminated()) {
- }
- logger.info("Finished all threads");
- }
-
- protected synchronized BigInteger incrementGlobalHits(Integer total) {
- this.globalHits = this.globalHits.add(BigInteger.valueOf(total));
- return this.globalHits;
- }
-
- private DateTime getStartDate() {
- if (START_DATE != null) {
- DateTimeFormatter dateParser = ISODateTimeFormat.dateParser().withZone(TIMEZONE);
- return dateParser.parseDateTime(START_DATE);
- }
-
- JestClientFactory factory = new JestClientFactory();
- factory.setHttpClientConfig(
- new HttpClientConfig.Builder(ELASTICSEARCH_URL).multiThreaded(true).build());
- GetAliases aliases = new GetAliases.Builder().build();
- JestClient client = factory.getObject();
- DateTime first = null;
- try {
- JestResult result = client.execute(aliases);
- if (!result.isSucceeded()) {
- logger.error(result.getErrorMessage());
- System.exit(1);
- }
-
- for (Entry<String, JsonElement> entry : result.getJsonObject().entrySet()) {
- Pattern pattern = Pattern.compile("^api-umbrella.*-([0-9]{4})-([0-9]{2})$");
- Matcher matches = pattern.matcher(entry.getKey());
- if (matches.find()) {
- DateTime indexDate =
- new DateTime(Integer.parseInt(matches.group(1)), Integer.parseInt(matches.group(2)),
- 1, 0, 0, 0, DateTimeZone.UTC).withZone(TIMEZONE).withTime(0, 0, 0, 0);
- if (first == null || indexDate.isBefore(first)) {
- first = indexDate;
- }
- }
- }
- } catch (IOException e) {
- e.printStackTrace();
- System.exit(1);
- }
-
- return first;
- }
-
- private DateTime getEndDate() {
- if (END_DATE != null) {
- DateTimeFormatter dateParser = ISODateTimeFormat.dateParser().withZone(TIMEZONE);
- return dateParser.parseDateTime(END_DATE);
- }
-
- return new DateTime();
- }
-
- public static void main(String[] args) throws SecurityException, IOException {
- new App();
- }
-}
diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/DayWorker.java b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/DayWorker.java
deleted file mode 100644
index 702705595..000000000
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/DayWorker.java
+++ /dev/null
@@ -1,597 +0,0 @@
-package apiumbrella.hadoop_analytics;
-
-import java.io.IOException;
-import java.math.BigInteger;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.net.URLEncoder;
-import java.nio.file.Paths;
-import java.text.NumberFormat;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.io.orc.CompressionKind;
-import org.apache.hadoop.hive.ql.io.orc.OrcFile;
-import org.apache.hadoop.hive.ql.io.orc.OrcFile.WriterOptions;
-import org.apache.hadoop.hive.ql.io.orc.Writer;
-import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.hive.serde2.io.ShortWritable;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.StructField;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
-import org.apache.hadoop.io.BooleanWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.joda.time.DateTime;
-import org.joda.time.DateTimeZone;
-import org.joda.time.Period;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-import org.joda.time.format.ISODateTimeFormat;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.gson.JsonArray;
-import com.google.gson.JsonElement;
-import com.google.gson.JsonObject;
-import com.google.gson.JsonPrimitive;
-
-import io.searchbox.client.JestClient;
-import io.searchbox.client.JestClientFactory;
-import io.searchbox.client.JestResult;
-import io.searchbox.client.config.HttpClientConfig;
-import io.searchbox.core.Search;
-import io.searchbox.core.SearchScroll;
-import io.searchbox.params.Parameters;
-
-public class DayWorker implements Runnable {
- final Logger logger = LoggerFactory.getLogger(DayWorker.class);
-
- private DateTime dayStartTime;
- private DateTime dayEndTime;
- private static LogSchema schema;
- private App app;
- private int totalProcessedHits = 0;
- private int totalHits;
- private static WriterOptions orcWriterOptions;
- private Writer orcWriter;
- DateTimeFormatter dateTimeParser = ISODateTimeFormat.dateTimeParser();
- DateTimeFormatter dateTimeFormatter =
- DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(App.TIMEZONE);
- DateTimeFormatter dateFormatter = ISODateTimeFormat.date().withZone(App.TIMEZONE);
-
- public DayWorker(App app, DateTime date) {
- this.app = app;
- dayStartTime = date;
- dayEndTime = this.dayStartTime.plus(Period.days(1));
- schema = new LogSchema();
- }
-
- public void run() {
- try {
- JestClientFactory factory = new JestClientFactory();
- factory.setHttpClientConfig(new HttpClientConfig.Builder(App.ELASTICSEARCH_URL)
- .multiThreaded(true).connTimeout(60000).readTimeout(120000).build());
- JestClient client = factory.getObject();
-
- // Perform a scroll query to fetch the specified day's data from
- // elasticsearch.
- String query = "{" + //
- " \"sort\":\"request_at\"," + //
- " \"query\":{" + //
- " \"filtered\":{" + //
- " \"filter\":{" + //
- " \"range\":{" + //
- " \"request_at\":{" + //
- " \"gte\":" + this.dayStartTime.getMillis() + "," + //
- " \"lt\":" + this.dayEndTime.getMillis() + //
- " }" + //
- " }" + //
- " }" + //
- " }" + //
- " }" + //
- "}";
- // Query the indexes that cover this day (this may involve multiple indexes on month
- // boundaries, if we're converting into a timezone, since the indexes are UTC-based).
- String startTimeIndex = "api-umbrella-logs-"
- + ISODateTimeFormat.yearMonth().withZone(DateTimeZone.UTC).print(dayStartTime);
- String endTimeIndex = "api-umbrella-logs-"
- + ISODateTimeFormat.yearMonth().withZone(DateTimeZone.UTC).print(dayEndTime);
- Search search = new Search.Builder(query).ignoreUnavailable(true).addIndex(startTimeIndex)
- .addIndex(endTimeIndex).setParameter(Parameters.SIZE, App.PAGE_SIZE)
- .setParameter(Parameters.SCROLL, "3m").build();
-
- JestResult result = client.execute(search);
- if (!result.isSucceeded()) {
- logger.error(result.getErrorMessage());
- System.exit(1);
- }
-
- String scrollId = result.getJsonObject().get("_scroll_id").getAsString();
- this.totalHits = result.getJsonObject().getAsJsonObject("hits").get("total").getAsInt();
-
- while (true) {
- // Keep looping until the scroll result returns no results.
- if (!processResult(result)) {
- break;
- }
-
- SearchScroll scroll = new SearchScroll.Builder(scrollId, "3m").build();
- result = client.execute(scroll);
- if (!result.isSucceeded()) {
- logger.error(result.getErrorMessage());
- System.exit(1);
- }
-
- scrollId = result.getJsonObject().get("_scroll_id").getAsString();
- }
-
- // Close the data file (but only if it exists, so we skip over days
- // with no data).
- if (this.orcWriter != null) {
- this.orcWriter.close();
- this.orcWriter = null;
- }
- } catch (Exception e) {
- logger.error("Unexpected error", e);
- System.exit(1);
- }
- }
-
- private static WriterOptions getOrcWriterOptions() throws IOException {
- if (orcWriterOptions == null) {
- ArrayList<StructField> orcFields = new ArrayList<StructField>();
- for (int i = 0; i < schema.getNonPartitionFieldsList().size(); i++) {
- String field = schema.getNonPartitionFieldsList().get(i);
- Schema.Type type = schema.getFieldType(field);
-
- ObjectInspector inspector;
- if (type == Schema.Type.INT) {
- if (schema.isFieldTypeShort(field)) {
- inspector = PrimitiveObjectInspectorFactory.writableShortObjectInspector;
- } else {
- inspector = PrimitiveObjectInspectorFactory.writableIntObjectInspector;
- }
- } else if (type == Schema.Type.LONG) {
- inspector = PrimitiveObjectInspectorFactory.writableLongObjectInspector;
- } else if (type == Schema.Type.DOUBLE) {
- inspector = PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
- } else if (type == Schema.Type.BOOLEAN) {
- inspector = PrimitiveObjectInspectorFactory.writableBooleanObjectInspector;
- } else if (type == Schema.Type.STRING) {
- inspector = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
- } else {
- throw new IOException("Unknown type: " + type.toString());
- }
-
- orcFields.add(new OrcField(field, inspector, i));
- }
-
- Configuration conf = new Configuration();
- // Fix for hadoop jar ordering: http://stackoverflow.com/a/21118824
- conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
- conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
-
- orcWriterOptions = OrcFile.writerOptions(conf);
- orcWriterOptions.compress(CompressionKind.ZLIB);
- orcWriterOptions.inspector(new OrcRowInspector(orcFields));
- }
-
- return orcWriterOptions;
- }
-
- private Writer getOrcWriter() throws IOException {
- if (this.orcWriter == null) {
- String date = dateFormatter.print(dayStartTime);
- // Create a new file in /dir/YYYY/MM/WW/YYYY-MM-DD.par
- Path path = new Path(App.HDFS_URI + Paths.get(App.DIR,
- "timestamp_tz_year=" + dateFormatter.print(dayStartTime.withDayOfYear(1)),
- "timestamp_tz_month=" + dateFormatter.print(dayStartTime.withDayOfMonth(1)),
- "timestamp_tz_week=" + dateFormatter.print(dayStartTime.withDayOfWeek(1)),
- "timestamp_tz_date=" + date, date + ".orc"));
- this.orcWriter = OrcFile.createWriter(path, getOrcWriterOptions());
- }
-
- return this.orcWriter;
- }
-
- private boolean processResult(JestResult result) throws Exception {
- JsonArray hits = result.getJsonObject().getAsJsonObject("hits").getAsJsonArray("hits");
-
- int pageHits = hits.size();
- if (pageHits == 0) {
- return false;
- }
-
- this.totalProcessedHits += pageHits;
-
- BigInteger globalHits = this.app.incrementGlobalHits(pageHits);
- NumberFormat numberFormatter = NumberFormat.getNumberInstance(Locale.US);
- DateTime firstRequestAt = this.parseTimestamp(
- hits.get(0).getAsJsonObject().get("_source").getAsJsonObject().get("request_at"));
- logger.info(String.format("Processing %s to %s | %10s / %10s | %12s | %s", this.dayStartTime,
- this.dayEndTime, numberFormatter.format(this.totalProcessedHits),
- numberFormatter.format(this.totalHits), numberFormatter.format(globalHits),
- firstRequestAt));
-
- for (int i = 0; i < pageHits; i++) {
- JsonObject hit = hits.get(i).getAsJsonObject();
- this.processHit(hit);
- }
-
- return true;
- }
-
- private void processHit(JsonObject hit) throws Exception {
- JsonObject source = hit.get("_source").getAsJsonObject();
-
- try {
- // For each hit, create a new Avro record to serialize it into the new
- // format for storage.
- GenericRecord log = new GenericData.Record(schema.getSchema());
- log.put("id", hit.get("_id").getAsString());
-
- // Loop over each attribute in the source data, assigning each value to
- // the new data record.
- for (Map.Entry<String, JsonElement> entry : source.entrySet()) {
- String key = entry.getKey();
-
- // Skip this field if we've explicitly marked it as not migrating.
- if (App.SKIP_FIELDS.contains(key)) {
- continue;
- }
-
- JsonElement value = entry.getValue();
-
- // Skip setting anything if the value is null.
- if (value == null || value.isJsonNull()) {
- continue;
- }
-
- // Handle special processing for certain fields.
- switch (key) {
- case "request_at":
- // Split up the timestamp into several fields for better compatibility
- // with the Kylin cubes that will be created (which don't support
- // timestamps yet).
- DateTime requestAt = this.parseTimestamp(value);
- log.put("timestamp_utc", requestAt.getMillis());
- log.put("timestamp_tz_offset", App.TIMEZONE.getOffset(requestAt.getMillis()));
- log.put("timestamp_tz_year", dateFormatter.print(requestAt.withDayOfYear(1)));
- log.put("timestamp_tz_month", dateFormatter.print(requestAt.withDayOfMonth(1)));
- log.put("timestamp_tz_week", dateFormatter.print(requestAt.withDayOfWeek(1)));
- log.put("timestamp_tz_date", dateFormatter.print(requestAt));
- log.put("timestamp_tz_hour", dateTimeFormatter
- .print(requestAt.withMinuteOfHour(0).withSecondOfMinute(0).withMillisOfSecond(0)));
- log.put("timestamp_tz_minute",
- dateTimeFormatter.print(requestAt.withSecondOfMinute(0).withMillisOfSecond(0)));
- value = null;
- break;
- case "request_ip_location":
- // Flatten the location object into two separate fields.
- log.put("request_ip_lat", value.getAsJsonObject().get("lat").getAsDouble());
- log.put("request_ip_lon", value.getAsJsonObject().get("lon").getAsDouble());
- value = null;
- break;
- case "request_url":
- // Perform various cleanup and sanity checks on storing the URL as
- // separate fields (versus the duplicative separate fields plus a full
- // URL field). The full URL field sometimes differs in the data versus
- // the individual fields, so we want to make sure we're transferring
- // the best data possible and not losing anything in the process.
- URL url;
- try {
- url = new URL(value.getAsString());
- } catch (MalformedURLException e) {
- try {
- // Cleanup some oddities in some invalid URLs seen (I think from
- // localhost testing).
- url = new URL(value.getAsString().replace(":80:80/", ":80/").replace("://[", "://")
- .replace("]/", "/"));
- } catch (MalformedURLException e2) {
- logger.error(hit.toString());
- throw (e2);
- }
- }
-
- // Store the original request_scheme, since that seems to be more
- // accurate than sometimes incorrect http:// urls on request_url that
- // are actually https.
- String requestScheme = source.get("request_scheme").getAsString();
- if (!url.getProtocol().equals(requestScheme)) {
- logger.warn("request_url's scheme (" + url.getProtocol()
- + ") does not match request_scheme (" + requestScheme + ")");
- }
- log.put("request_url_scheme", requestScheme);
-
- // Store the host extracted from the full URL, since that seems more
- // accurate than the separate request_host field (it seems to better
- // handle some odd invalid hostnames, which probably don't actually
- // matter too much).
- String requestHost = source.get("request_host").getAsString().toLowerCase();
- String urlHost = url.getHost().toLowerCase();
- if (!urlHost.equals(requestHost)) {
- logger.warn("request_url's host (" + url.getHost() + ") does not match request_host ("
- + requestHost + ")");
- }
- log.put("request_url_host", urlHost);
-
- // As a new field, store the port used. Most of the time this will be
- // the default 80 or 443, depending on the scheme.
- int urlPort = url.getPort();
- if (log.get("request_url_scheme").equals("https") && urlPort == 80) {
- log.put("request_url_port", 443);
- } else {
- // If the port isn't set, or it's 50090, set it to the default port
- // based on the scheme. We're ignoring port 50090, since this is
- // present on some of our rather old imported logs, and was an
- // internal-only port that was used (but was never public, so this
- // isn't accurate).
- if (urlPort == -1 || urlPort == 50090) {
- if (log.get("request_url_scheme").equals("https")) {
- log.put("request_url_port", 443);
- } else {
- log.put("request_url_port", 80);
- }
- } else {
- log.put("request_url_port", urlPort);
- }
- }
-
- // Store the path extracted from the full URL, since it seems to be
- // more accurate at dealing with odd URL encoding issues.
- String requestPath = source.get("request_path").getAsString();
- if (!url.getPath().equals(requestPath)) {
- // Before throwing a warning, ignore some semi-common URL encoding
- // differences between the full URL and the request_path field
- // (where we're comfortable with the encoding of the full URL's
- // version). Also deal with missing hash fragment details.
- String encodedUrlPath = url.getPath();
- if (url.getRef() != null && url.getQuery() == null) {
- encodedUrlPath += "#" + url.getRef();
- }
- encodedUrlPath = URLEncoder.encode(encodedUrlPath, "UTF-8");
- encodedUrlPath = encodedUrlPath.replace("%25", "%");
-
- String encodedRequestPath = requestPath.replaceAll("/(x[0-9])", "\\\\$1");
- encodedRequestPath = URLEncoder.encode(encodedRequestPath, "UTF-8");
- encodedRequestPath = encodedRequestPath.replace("%25", "%");
-
- if (!encodedUrlPath.equals(encodedRequestPath)) {
- logger.warn("request_url's path (" + url.getPath() + " - " + encodedUrlPath
- + ") does not match request_path (" + requestPath + " - " + encodedRequestPath
- + ")");
- }
- }
- log.put("request_url_path", url.getPath());
-
- String[] pathLevels = StringUtils.split(url.getPath(), "/", 6);
- for (int i = 0; i < pathLevels.length; i++) {
- String pathLevel = pathLevels[i];
- if (i < pathLevels.length - 1) {
- pathLevel = pathLevel + "/";
- }
- if (i == 0 && pathLevel != "/") {
- pathLevel = "/" + pathLevel;
- }
- log.put("request_url_path_level" + (i + 1), pathLevel);
- }
-
- // Store the query string extracted from the full URL.
- String requestQuery = url.getQuery();
- log.put("request_url_query", requestQuery);
-
- // If a hash fragment is present in the full URL, this is actually a
- // flag that something's fishy with the URL encoding, since our
- // server-side logs can't possibly contain fragment information. So
- // we'll assume this information actually represents something
- // following a URL-encoded hash fragment, and append that to the
- // appropriate place.
- String urlRef = url.getRef();
- if (urlRef != null) {
- if (log.get("request_url_query") != null) {
- log.put("request_url_query", log.get("request_url_query") + "%23" + urlRef);
- } else {
- log.put("request_url_path", log.get("request_url_path") + "%23" + urlRef);
- }
- }
-
- // Re-assemble the URL based on all of our newly stored individual
- // components.
- String reassmbledUrl =
- log.get("request_url_scheme") + "://" + log.get("request_url_host");
- if (log.get("request_url_scheme").equals("http")) {
- if ((Integer) log.get("request_url_port") != 80) {
- reassmbledUrl = reassmbledUrl + ":" + log.get("request_url_port");
- }
- } else if (log.get("request_url_scheme").equals("https")) {
- if ((Integer) log.get("request_url_port") != 443) {
- reassmbledUrl = reassmbledUrl + ":" + log.get("request_url_port");
- }
- } else {
- reassmbledUrl = reassmbledUrl + ":" + log.get("request_url_port");
- }
- reassmbledUrl = reassmbledUrl + log.get("request_url_path");
- if (requestQuery != null) {
- reassmbledUrl = reassmbledUrl + "?" + log.get("request_url_query");
- }
-
- // As a last sanity check to make sure we're not throwing away data as
- // part of this migration, compare the original full URL string to the
- // new URL composed of the various parts.
- if (!value.getAsString().equals(reassmbledUrl)) {
- // Ignore some of the default ports for comparison.
- String compareUrl = value.getAsString().replaceFirst(":(80|443|50090)/", "/");
-
- // Ignore url encoding of the hash fragment.
- compareUrl = compareUrl.replaceFirst("#", "%23");
-
- // Ignore case-sensitivity on the Host.
- Pattern pattern = Pattern.compile("://(.+?)/");
- Matcher matcher = pattern.matcher(compareUrl);
- StringBuffer buffer = new StringBuffer();
- while (matcher.find()) {
- matcher.appendReplacement(buffer,
- Matcher.quoteReplacement("://" + matcher.group(1).toLowerCase() + "/"));
- }
- matcher.appendTail(buffer);
- compareUrl = buffer.toString();
-
- if (!compareUrl.equals(reassmbledUrl)) {
- logger.warn("request_url (" + value.getAsString() + " - " + compareUrl
- + ") does not match reassembled URL (" + reassmbledUrl + ")");
- }
- }
-
- value = null;
- break;
-
- // The following are some renamed fields to better normalize the new
- // storage schema.
- case "response_time":
- key = "timer_response";
- break;
- case "backend_response_time":
- key = "timer_backend_response";
- break;
- case "internal_gatekeeper_time":
- key = "timer_internal";
- break;
- case "proxy_overhead":
- key = "timer_proxy_overhead";
- break;
- case "gatekeeper_denied_code":
- key = "denied_reason";
- break;
- case "imported":
- key = "log_imported";
- break;
- }
-
- // Handle empty strings values.
- if (value != null && value.isJsonPrimitive() && value.getAsJsonPrimitive().isString()
- && value.getAsJsonPrimitive().getAsString().equals("")) {
- switch (key) {
- // Replace empty string request_ips with 127.0.0.1. There's not a lot
- // of these empty string values, but since this is considered a
- // required field moving forward, let's make sure we have at least
- // some real value in there.
- case "request_ip":
- logger.warn(
- key + " contains empty string value: replacing with 127.0.0.1 since NULLs are not allowed on this field: "
- + hit);
- value = new JsonPrimitive("127.0.0.1");
- break;
-
- // Set empty string values to null.
- //
- // I think in all of our cases, nulls are more appropriate, we just
- // have empty strings in some of our source data due to various
- // migrations. Kylin (v1.2) also seems to treat empty strings as NULLs
- // in the rollups, so for consistency sake, we'll avoid them.
- default:
- value = null;
- break;
- }
- }
-
- if (value != null) {
- // Set the value on the new record, performing type-casting as needed.
- try {
- Schema.Type type = schema.getFieldType(key);
- if (type == Schema.Type.INT) {
- log.put(key, value.getAsInt());
- } else if (type == Schema.Type.LONG) {
- log.put(key, value.getAsLong());
- } else if (type == Schema.Type.DOUBLE) {
- log.put(key, value.getAsDouble());
- } else if (type == Schema.Type.BOOLEAN) {
- log.put(key, value.getAsBoolean());
- } else {
- try {
- log.put(key, value.getAsString());
- } catch (IllegalStateException e) {
- // Handle some unexpected array types by comma-delimiting the
- // values.
- if (value.isJsonArray()) {
- StringBuffer buffer = new StringBuffer();
- Iterator<JsonElement> iter = value.getAsJsonArray().iterator();
- while (iter.hasNext()) {
- buffer.append(iter.next().getAsString());
- if (iter.hasNext()) {
- buffer.append(", ");
- }
- }
- log.put(key, buffer.toString());
- } else {
- throw (e);
- }
- }
- }
- } catch (Exception e) {
- logger.error("Eror on field: " + key);
- logger.error(e.getMessage());
- throw (e);
- }
- }
- }
-
- OrcRow orcRecord = new OrcRow(schema.getNonPartitionFieldsList().size());
- for (int i = 0; i < schema.getNonPartitionFieldsList().size(); i++) {
- String field = schema.getNonPartitionFieldsList().get(i);
- Schema.Type type = schema.getFieldType(field);
- Object rawValue = log.get(field);
- Object value;
-
- if (rawValue != null) {
- if (type == Schema.Type.INT) {
- if (schema.isFieldTypeShort(field)) {
- value = new ShortWritable(((Integer) rawValue).shortValue());
- } else {
- value = new IntWritable((int) rawValue);
- }
- } else if (type == Schema.Type.LONG) {
- value = new LongWritable((long) rawValue);
- } else if (type == Schema.Type.DOUBLE) {
- value = new DoubleWritable((double) rawValue);
- } else if (type == Schema.Type.BOOLEAN) {
- value = new BooleanWritable((boolean) rawValue);
- } else if (type == Schema.Type.STRING) {
- value = new Text((String) rawValue);
- } else {
- throw new IOException("Unknown type: " + type.toString());
- }
-
- orcRecord.setFieldValue(i, value);
- }
- }
-
- this.getOrcWriter().addRow(orcRecord);
- } catch (Exception e) {
- logger.error("Error on hit: " + hit);
- logger.error(e.getMessage());
- throw (e);
- }
- }
-
- private DateTime parseTimestamp(JsonElement value) {
- DateTime date;
- if (value.getAsJsonPrimitive().isNumber()) {
- date = new DateTime(value.getAsLong(), DateTimeZone.UTC);
- } else {
- date = this.dateTimeParser.parseDateTime(value.getAsString());
- }
- return date.withZone(App.TIMEZONE);
- }
-}
diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcField.java b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcField.java
deleted file mode 100644
index 0f734484b..000000000
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcField.java
+++ /dev/null
@@ -1,37 +0,0 @@
-package apiumbrella.hadoop_analytics;
-
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.StructField;
-
-// From https://gist.github.com/omalley/ccabae7cccac28f64812
-public class OrcField implements StructField {
- private final String name;
- private final ObjectInspector inspector;
- final int offset;
-
- OrcField(String name, ObjectInspector inspector, int offset) {
- this.name = name;
- this.inspector = inspector;
- this.offset = offset;
- }
-
- @Override
- public String getFieldName() {
- return name;
- }
-
- @Override
- public ObjectInspector getFieldObjectInspector() {
- return inspector;
- }
-
- @Override
- public int getFieldID() {
- return offset;
- }
-
- @Override
- public String getFieldComment() {
- return null;
- }
-}
diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRow.java b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRow.java
deleted file mode 100644
index 9b0cb126c..000000000
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRow.java
+++ /dev/null
@@ -1,22 +0,0 @@
-package apiumbrella.hadoop_analytics;
-
-// From https://gist.github.com/omalley/ccabae7cccac28f64812
-public class OrcRow {
- public Object[] columns;
-
- OrcRow(int colCount) {
- columns = new Object[colCount];
- }
-
- void setFieldValue(int FieldIndex, Object value) {
- columns[FieldIndex] = value;
- }
-
- void setNumFields(int newSize) {
- if (newSize != columns.length) {
- Object[] oldColumns = columns;
- columns = new Object[newSize];
- System.arraycopy(oldColumns, 0, columns, 0, oldColumns.length);
- }
- }
-}
diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRowInspector.java b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRowInspector.java
deleted file mode 100644
index 829daa350..000000000
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/OrcRowInspector.java
+++ /dev/null
@@ -1,121 +0,0 @@
-package apiumbrella.hadoop_analytics;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.StructField;
-
-// From https://gist.github.com/omalley/ccabae7cccac28f64812
-public class OrcRowInspector extends SettableStructObjectInspector {
- private List<StructField> fields;
-
- public OrcRowInspector(List<StructField> fields) {
- super();
- this.fields = fields;
- }
-
- @Override
- public List<StructField> getAllStructFieldRefs() {
- return fields;
- }
-
- @Override
- public StructField getStructFieldRef(String s) {
- for (StructField field : fields) {
- if (field.getFieldName().equalsIgnoreCase(s)) {
- return field;
- }
- }
- return null;
- }
-
- @Override
- public Object getStructFieldData(Object object, StructField field) {
- if (object == null) {
- return null;
- }
- int offset = ((OrcField) field).offset;
- OrcRow struct = (OrcRow) object;
- if (offset >= struct.columns.length) {
- return null;
- }
-
- return struct.columns[offset];
- }
-
- @Override
- public List<Object> getStructFieldsDataAsList(Object object) {
- if (object == null) {
- return null;
- }
- OrcRow struct = (OrcRow) object;
- List<Object> result = new ArrayList<Object>(struct.columns.length);
- for (Object child : struct.columns) {
- result.add(child);
- }
- return result;
- }
-
- @Override
- public String getTypeName() {
- StringBuilder buffer = new StringBuilder();
- buffer.append("struct<");
- for (int i = 0; i < fields.size(); ++i) {
- StructField field = fields.get(i);
- if (i != 0) {
- buffer.append(",");
- }
- buffer.append(field.getFieldName());
- buffer.append(":");
- buffer.append(field.getFieldObjectInspector().getTypeName());
- }
- buffer.append(">");
- return buffer.toString();
- }
-
- @Override
- public Category getCategory() {
- return Category.STRUCT;
- }
-
- @Override
- public Object create() {
- return new OrcRow(0);
- }
-
- @Override
- public Object setStructFieldData(Object struct, StructField field, Object fieldValue) {
- OrcRow orcStruct = (OrcRow) struct;
- int offset = ((OrcField) field).offset;
- // if the offset is bigger than our current number of fields, grow it
- if (orcStruct.columns.length <= offset) {
- orcStruct.setNumFields(offset + 1);
- }
- orcStruct.setFieldValue(offset, fieldValue);
- return struct;
- }
-
- @Override
- public boolean equals(Object o) {
- if (o == null || o.getClass() != getClass()) {
- return false;
- } else if (o == this) {
- return true;
- } else {
- List<StructField> other = ((OrcRowInspector) o).fields;
- if (other.size() != fields.size()) {
- return false;
- }
- for (int i = 0; i < fields.size(); ++i) {
- StructField left = other.get(i);
- StructField right = fields.get(i);
- if (!(left.getFieldName().equalsIgnoreCase(right.getFieldName())
- && left.getFieldObjectInspector().equals(right.getFieldObjectInspector()))) {
- return false;
- }
- }
- return true;
- }
- }
-}
diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/resources/simplelogger.properties b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/resources/simplelogger.properties
deleted file mode 100644
index 780ec143c..000000000
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/resources/simplelogger.properties
+++ /dev/null
@@ -1,6 +0,0 @@
-org.slf4j.simpleLogger.logFile=log/elasticsearch-import.log
-org.slf4j.simpleLogger.defaultLogLevel=warn
-org.slf4j.simpleLogger.log.apiumbrella=info
-org.slf4j.simpleLogger.showDateTime=true
-org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd'T'HH:mm:ss.SSSZ"
-org.slf4j.simpleLogger.showLogName=false
\ No newline at end of file
diff --git a/src/api-umbrella/hadoop-analytics/pom.xml b/src/api-umbrella/hadoop-analytics/pom.xml
deleted file mode 100644
index 85f4f5905..000000000
--- a/src/api-umbrella/hadoop-analytics/pom.xml
+++ /dev/null
@@ -1,30 +0,0 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <groupId>api-umbrella.hadoop-analytics</groupId>
-  <artifactId>hadoop-analytics</artifactId>
-  <version>0.0.1-SNAPSHOT</version>
-  <packaging>pom</packaging>
-
-  <name>hadoop-analytics</name>
-  <url>http://maven.apache.org</url>
-
-  <properties>
-    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    <buildDir>target</buildDir>
-  </properties>
-
-  <build>
-    <directory>${buildDir}</directory>
-  </build>
-
-  <modules>
-    <module>elasticsearch-import</module>
-    <module>processor</module>
-    <module>schema</module>
-  </modules>
-</project>
diff --git a/src/api-umbrella/hadoop-analytics/processor/.gitignore b/src/api-umbrella/hadoop-analytics/processor/.gitignore
deleted file mode 100644
index 7e5babc1e..000000000
--- a/src/api-umbrella/hadoop-analytics/processor/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-/.classpath
-/.project
-/.settings
-/dependency-reduced-pom.xml
-/target/
diff --git a/src/api-umbrella/hadoop-analytics/processor/pom.xml b/src/api-umbrella/hadoop-analytics/processor/pom.xml
deleted file mode 100644
index b4a144045..000000000
--- a/src/api-umbrella/hadoop-analytics/processor/pom.xml
+++ /dev/null
@@ -1,107 +0,0 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>api-umbrella.hadoop-analytics</groupId>
-    <artifactId>hadoop-analytics</artifactId>
-    <version>0.0.1-SNAPSHOT</version>
-  </parent>
-  <artifactId>processor</artifactId>
-  <packaging>jar</packaging>
-
-  <name>processor</name>
-  <url>http://maven.apache.org</url>
-
-  <properties>
-    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>api-umbrella.hadoop-analytics</groupId>
-      <artifactId>schema</artifactId>
-      <version>0.0.1-SNAPSHOT</version>
-    </dependency>
-
-    <dependency>
-      <groupId>joda-time</groupId>
-      <artifactId>joda-time</artifactId>
-      <version>2.9.2</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-log4j12</artifactId>
-      <version>1.7.21</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.hive</groupId>
-      <artifactId>hive-jdbc</artifactId>
-      <version>0.14.0</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-client</artifactId>
-      <version>2.6.0</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.quartz-scheduler</groupId>
-      <artifactId>quartz</artifactId>
-      <version>2.2.3</version>
-    </dependency>
-    <dependency>
-      <groupId>org.quartz-scheduler</groupId>
-      <artifactId>quartz-jobs</artifactId>
-      <version>2.2.3</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <directory>${buildDir}/processor</directory>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-shade-plugin</artifactId>
-        <version>2.4.3</version>
-        <executions>
-          <execution>
-            <phase>package</phase>
-            <goals>
-              <goal>shade</goal>
-            </goals>
-            <configuration>
-              <transformers>
-                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
-                  <mainClass>apiumbrella.hadoop_analytics.App</mainClass>
-                </transformer>
-              </transformers>
-              <filters>
-                <filter>
-                  <artifact>*:*</artifact>
-                  <excludes>
-                    <exclude>META-INF/*.SF</exclude>
-                    <exclude>META-INF/*.DSA</exclude>
-                    <exclude>META-INF/*.RSA</exclude>
-                  </excludes>
-                </filter>
-              </filters>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-    </plugins>
-  </build>
-</project>
diff --git a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/App.java b/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/App.java
deleted file mode 100644
index 26672a1cb..000000000
--- a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/App.java
+++ /dev/null
@@ -1,210 +0,0 @@
-package apiumbrella.hadoop_analytics;
-
-import java.io.IOException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.sql.Connection;
-import java.sql.DriverManager;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
-import org.joda.time.DateTime;
-import org.joda.time.DateTimeZone;
-import org.joda.time.format.DateTimeFormatter;
-import org.joda.time.format.ISODateTimeFormat;
-import org.quartz.CronScheduleBuilder;
-import org.quartz.JobBuilder;
-import org.quartz.JobDetail;
-import org.quartz.Scheduler;
-import org.quartz.SchedulerException;
-import org.quartz.SimpleScheduleBuilder;
-import org.quartz.Trigger;
-import org.quartz.TriggerBuilder;
-import org.quartz.impl.StdSchedulerFactory;
-import org.slf4j.Logger;
-
-public class App {
- private static boolean DISABLE_LIVE_DATA_CONVERSION =
- Boolean.getBoolean("apiumbrella.disable_live_data_conversion");
- private static boolean DISABLE_KYLIN_REFRESH =
- Boolean.getBoolean("apiumbrella.disable_kylin_refresh");
- protected static DateTimeZone TIMEZONE =
- DateTimeZone.forID(System.getProperty("apiumbrella.timezone", "UTC"));
- protected static final String HDFS_URI =
- System.getProperty("apiumbrella.hdfs_uri", "hdfs://127.0.0.1:8020");
- protected static final String HDFS_ROOT = "/apps/api-umbrella";
- protected static final String HDFS_LOGS_ROOT = HDFS_ROOT + "/logs";
- protected static final String HDFS_LOGS_LIVE_ROOT = App.HDFS_ROOT + "/logs-live";
- protected static final String LOGS_TABLE_NAME = "api_umbrella.logs";
- protected static final String LOGS_LIVE_TABLE_NAME = "api_umbrella.logs_live";
-
- public void run() throws SchedulerException {
- Scheduler scheduler = StdSchedulerFactory.getDefaultScheduler();
- scheduler.start();
-
- if (!DISABLE_LIVE_DATA_CONVERSION) {
- /*
- * Job to convert the live log data (coming from Kafka & Flume) into the ORC files for
- * querying and long-term storage.
- */
- JobDetail convertJob = JobBuilder.newJob(ConvertLiveDataToOrc.class)
- .withIdentity("convertLiveDataJob").usingJobData("lastMigratedPartitionTime", 0L)
- .usingJobData("lastConcatenateTime", 0L).usingJobData("tablesCreated", false).build();
- /* Run every 30 seconds */
- Trigger convertTrigger =
- TriggerBuilder.newTrigger().withIdentity("convertLiveDataTrigger").startNow()
- .withSchedule(
- SimpleScheduleBuilder.simpleSchedule().withIntervalInSeconds(30).repeatForever())
- .build();
- scheduler.scheduleJob(convertJob, convertTrigger);
- }
-
- if (!DISABLE_KYLIN_REFRESH) {
- /* Job to process the new log data into the pre-aggregated Kylin cubes. */
- JobDetail refreshJob = JobBuilder.newJob(RefreshKylin.class).withIdentity("refreshKylinJob")
- .storeDurably().build();
- /*
- * Run this job once immediately on boot to catch up with any new data that needs processing.
- */
- Trigger refreshImmediateTrigger = TriggerBuilder.newTrigger()
- .withIdentity("refreshKylinImmediateTrigger").forJob(refreshJob).startNow()
- .withSchedule(SimpleScheduleBuilder.simpleSchedule()).build();
- /*
- * After the initial run, run this every morning at 01:00 in the timezone used for data. This
- * assumes that by 01:00 there will no longer be any data writing to the previous day's
- * partition, so it's safe to fully process that day into Kylin.
- */
- Trigger refreshDailyTrigger = TriggerBuilder.newTrigger()
- .withIdentity("refreshKylinDailyTrigger").forJob(refreshJob).startNow()
- .withSchedule(
- CronScheduleBuilder.dailyAtHourAndMinute(1, 0).inTimeZone(TIMEZONE.toTimeZone()))
- .build();
- scheduler.addJob(refreshJob, false);
- scheduler.scheduleJob(refreshImmediateTrigger);
- scheduler.scheduleJob(refreshDailyTrigger);
- }
-
- scheduler.start();
- }
-
- public static void main(String[] args) throws SchedulerException {
- if (System.getProperty("apiumbrella.root_log_level") == null) {
- System.setProperty("apiumbrella.root_log_level", "WARN");
- }
-
- if (System.getProperty("apiumbrella.log_level") == null) {
- System.setProperty("apiumbrella.log_level", "INFO");
- }
-
- App app = new App();
- app.run();
- }
-
- public static FileSystem getHadoopFileSystem() throws IOException, URISyntaxException {
- Configuration conf = new Configuration();
- conf.setInt("ipc.client.connect.max.retries.on.timeouts", 3);
- /* Fix for hadoop jar ordering: http://stackoverflow.com/a/21118824 */
- conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
- conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
-
- return FileSystem.get(new URI(HDFS_URI), conf);
- }
-
- public static Connection getHiveConnection() throws ClassNotFoundException, SQLException {
- /* Load the Hive JDBC class. */
- Class.forName("org.apache.hive.jdbc.HiveDriver");
-
- return DriverManager.getConnection("jdbc:hive2://localhost:10000/api_umbrella", "hive", "");
- }
-
- /**
- * Find any partitions in the api_umbrella.logs table that are composed of more than 1 file and
- * concatenate the partitions to merge them down to 1 file. This helps improve query performance
- * on the historical data.
- *
- * @param logger
- * @throws ClassNotFoundException
- * @throws SQLException
- * @throws IOException
- * @throws IllegalArgumentException
- * @throws URISyntaxException
- */
- public static synchronized void concatenateTablePartitions(Logger logger)
- throws ClassNotFoundException, SQLException, IOException, IllegalArgumentException,
- URISyntaxException {
- logger.debug("Begin concatenate table partitions");
-
- /* Build the CONCATENATE SQL command. */
- LogSchema logSchema = new LogSchema();
- StringBuilder sql = new StringBuilder();
- sql.append("ALTER TABLE " + App.LOGS_TABLE_NAME + " PARTITION(");
- for (int i = 0; i < logSchema.getPartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(",");
- }
- sql.append(logSchema.getPartitionFieldsList().get(i) + "=?");
- }
- sql.append(") CONCATENATE");
-
- /*
- * Iterate over all the partition files to detect partitions that contain more than 1 file.
- */
- RemoteIterator<LocatedFileStatus> filesIter =
- getHadoopFileSystem().listFiles(new Path(HDFS_URI + HDFS_LOGS_ROOT), true);
- Pattern partitionDatePattern = Pattern.compile(".*timestamp_tz_date=(\\d{4}-\\d{2}-\\d{2})");
- HashMap<DateTime, Integer> partitionFileCounts = new HashMap<DateTime, Integer>();
- while (filesIter.hasNext()) {
- FileStatus file = filesIter.next();
- Path partition = file.getPath().getParent();
- Matcher matcher = partitionDatePattern.matcher(partition.toString());
- DateTime partitionTime = null;
- while (matcher.find()) {
- partitionTime = new DateTime(matcher.group(1) + "T00:00:00");
- }
-
- Integer count = partitionFileCounts.get(partitionTime);
- if (count == null) {
- count = 0;
- }
- partitionFileCounts.put(partitionTime, count + 1);
- }
-
- /*
- * Iterate over all the partitions with more than 1 file and run the concatenate command against
- * them.
- */
- Connection connection = null;
- DateTimeFormatter dateFormatter = ISODateTimeFormat.date();
- for (Map.Entry<DateTime, Integer> entry : partitionFileCounts.entrySet()) {
- DateTime partition = entry.getKey();
- Integer count = entry.getValue();
- logger.debug("Partition " + partition + ": " + count + " files");
- if (count > 1) {
- if (connection == null) {
- connection = getHiveConnection();
- }
-
- logger.info("Concatenating table partition " + partition + " (" + count + " files)");
- PreparedStatement concatenate = connection.prepareStatement(sql.toString());
- concatenate.setString(1, dateFormatter.print(partition.withDayOfYear(1)));
- concatenate.setString(2, dateFormatter.print(partition.withDayOfMonth(1)));
- concatenate.setString(3, dateFormatter.print(partition.withDayOfWeek(1)));
- concatenate.setString(4, dateFormatter.print(partition));
- concatenate.executeUpdate();
- concatenate.close();
- }
- }
-
- logger.debug("Finish concatenate table partitions");
- }
-}
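The statement assembled by concatenateTablePartitions() is easier to follow with concrete values bound. A minimal sketch, assuming a hypothetical day partition of 2016-03-15 and reusing App.getHiveConnection() from this file; the class name is an illustrative placeholder, not part of the project:

    import java.sql.Connection;
    import java.sql.PreparedStatement;

    public class ConcatenatePartitionExample {
      public static void main(String[] args) throws Exception {
        Connection connection = App.getHiveConnection();
        // Same shape of statement that concatenateTablePartitions() builds, with the
        // four partition values bound for the hypothetical day 2016-03-15.
        PreparedStatement concatenate = connection.prepareStatement(
            "ALTER TABLE api_umbrella.logs PARTITION("
                + "timestamp_tz_year=?,timestamp_tz_month=?,timestamp_tz_week=?,timestamp_tz_date=?"
                + ") CONCATENATE");
        concatenate.setString(1, "2016-01-01"); // first day of the partition's year
        concatenate.setString(2, "2016-03-01"); // first day of its month
        concatenate.setString(3, "2016-03-14"); // Monday of its ISO week
        concatenate.setString(4, "2016-03-15"); // the day partition itself
        concatenate.executeUpdate();
        concatenate.close();
        connection.close();
      }
    }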
diff --git a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/ConvertLiveDataToOrc.java b/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/ConvertLiveDataToOrc.java
deleted file mode 100644
index d702d1a4e..000000000
--- a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/ConvertLiveDataToOrc.java
+++ /dev/null
@@ -1,496 +0,0 @@
-package apiumbrella.hadoop_analytics;
-
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.net.URISyntaxException;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.TreeSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
-import org.joda.time.DateTime;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-import org.joda.time.format.ISODateTimeFormat;
-import org.quartz.DisallowConcurrentExecution;
-import org.quartz.Job;
-import org.quartz.JobDataMap;
-import org.quartz.JobExecutionContext;
-import org.quartz.JobExecutionException;
-import org.quartz.PersistJobDataAfterExecution;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This class is responsible for migrating the live log data into permanent ORC storage (likely with
- * a 2-3 minute delay).
- *
- * Live log data is sent by the application servers in the following fashion:
- *
- * [nginx] --json--> [rsyslog] --json--> [flume] --json--> [hdfs]
- *
- * Flume stores the JSON files in directories partitioned by minute. This background job then runs
- * periodically to migrate this per-minute JSON data into the permanent ORC storage. The data is
- * migrated by appending each minute's data to the ORC table, which is partitioned by day. With this
- * approach, it may take around 2-3 minutes for the data to become populated in the ORC table (since
- * we have to wait until we're sure the minute partition is no longer being written to).
- *
- * Note: A cleaner and more real-time solution would involve populating the ORC table directly using Hive
- * transaction streaming: https://cwiki.apache.org/confluence/display/Hive/Streaming+Data+Ingest
- * However, since we're on Hive 0.14 (for Kylin & HDP 2.2 compatibility), there are several bugs in
- * the Hive streaming implementation that impact us
- * (https://issues.apache.org/jira/browse/HIVE-8966,
- * https://issues.apache.org/jira/browse/HIVE-11540,
- * https://issues.apache.org/jira/browse/HIVE-5143), and Flume's Hive integration is also currently
- * experimental (I've run into some issues and high memory use with it). So for now, we'll use this
- * more manual approach that can work with the older version of Hive, but in the future this may be
- * worth revisiting.
- */
-@DisallowConcurrentExecution
-@PersistJobDataAfterExecution
-public class ConvertLiveDataToOrc implements Job {
- private static final Path LAST_MIGRATED_MARKER =
- new Path(App.HDFS_URI + App.HDFS_ROOT + "/.logs-live-last-migrated-partition-time");
- final Logger logger;
-
- Pattern partitionDatePattern = Pattern.compile(
- ".*timestamp_tz_date=(\\d{4}-\\d{2}-\\d{2})/timestamp_tz_hour_minute=(\\d{2})-(\\d{2}).*");
- DateTimeFormatter dateFormatter = ISODateTimeFormat.date();
- DateTimeFormatter hourMinuteFormatter = DateTimeFormat.forPattern("HH-mm");
- private static LogSchema logSchema = new LogSchema();
- private static String createExternalTableSql;
- private static String createLiveExternalTableSql;
- private static String addPartitionSql;
- private static String addLivePartitionSql;
- private static String dropLivePartitionSql;
- private static String insertAppendSql;
- private static String insertOverwriteSql;
- private FileSystem fileSystem;
- private long lastMigratedPartitionTime = 0;
-
- public ConvertLiveDataToOrc() {
- logger = LoggerFactory.getLogger(this.getClass());
- logger.debug("Initializing " + this.getClass());
- }
-
- public void execute(JobExecutionContext context) throws JobExecutionException {
- try {
- run(context);
- } catch (Exception e) {
- logger.error("Convert live data to ORC error", e);
- JobExecutionException jobError = new JobExecutionException(e);
- throw jobError;
- }
- }
-
- public void run(JobExecutionContext context) throws SQLException, IOException,
- ClassNotFoundException, IllegalArgumentException, URISyntaxException {
- logger.debug("Begin processing");
-
- JobDataMap jobData = context.getJobDetail().getJobDataMap();
- lastMigratedPartitionTime = jobData.getLong("lastMigratedPartitionTime");
-
- fileSystem = App.getHadoopFileSystem();
- if (lastMigratedPartitionTime == 0) {
- if (fileSystem.exists(LAST_MIGRATED_MARKER)) {
- FSDataInputStream markerInputStream = fileSystem.open(LAST_MIGRATED_MARKER);
- lastMigratedPartitionTime = Long.parseLong(IOUtils.toString(markerInputStream), 10);
- logger.debug("Read last migrated partition timestamp marker: " + lastMigratedPartitionTime);
- }
- }
-
- Connection connection = null;
- try {
- connection = App.getHiveConnection();
- createTables(jobData, connection);
-
- for (DateTime partition : getInactivePartitions()) {
- if (partition.getMillis() > lastMigratedPartitionTime) {
- logger.info("Migrating partition: " + partition);
-
- PreparedStatement addPartition = connection.prepareStatement(getAddPartitionSql());
- addPartition.setString(1, dateFormatter.print(partition.withDayOfYear(1)));
- addPartition.setString(2, dateFormatter.print(partition.withDayOfMonth(1)));
- addPartition.setString(3, dateFormatter.print(partition.withDayOfWeek(1)));
- addPartition.setString(4, dateFormatter.print(partition));
- addPartition.executeUpdate();
- addPartition.close();
-
- PreparedStatement addLivePartition =
- connection.prepareStatement(getAddLivePartitionSql());
- addLivePartition.setString(1, dateFormatter.print(partition));
- addLivePartition.setString(2, hourMinuteFormatter.print(partition));
- addLivePartition.executeUpdate();
- addLivePartition.close();
-
- PreparedStatement migrate = connection.prepareStatement(getInsertAppendSql());
- migrate.setString(1, dateFormatter.print(partition.withDayOfYear(1)));
- migrate.setString(2, dateFormatter.print(partition.withDayOfMonth(1)));
- migrate.setString(3, dateFormatter.print(partition.withDayOfWeek(1)));
- migrate.setString(4, dateFormatter.print(partition));
- migrate.setString(5, dateFormatter.print(partition));
- migrate.setString(6, hourMinuteFormatter.print(partition));
- migrate.executeUpdate();
- migrate.close();
-
- lastMigratedPartitionTime = partition.getMillis();
- jobData.put("lastMigratedPartitionTime", lastMigratedPartitionTime);
-
- FSDataOutputStream markerOutputStream = fileSystem.create(LAST_MIGRATED_MARKER, true);
- BufferedWriter br = new BufferedWriter(new OutputStreamWriter(markerOutputStream));
- br.write(Long.toString(lastMigratedPartitionTime));
- br.close();
- } else {
- logger.debug("Skipping already processed partition: " + partition);
- }
- }
-
- jobData.put("lastMigratedPartitionTime", lastMigratedPartitionTime);
-
- /*
- * Since we're appending the live data to the ORC files every minute, we end up with many
- * small ORC files in the api_umbrella.logs partitions. To help improve query performance,
- * concatenate the table partitions every 1 hour (which merges the existing files down to a
- * single file).
- */
- long now = DateTime.now().getMillis();
- long lastConcatenateTime = jobData.getLong("lastConcatenateTime");
- if (now - lastConcatenateTime > 1 * 60 * 60 * 1000) {
- App.concatenateTablePartitions(logger);
- jobData.put("lastConcatenateTime", now);
- }
- } finally {
- if (connection != null) {
- connection.close();
- }
- }
-
- logger.debug("Finish processing");
- }
-
- /**
- * Create both external tables (if they do not already exist).
- *
- * @param connection
- * @throws SQLException
- */
- private void createTables(JobDataMap jobData, Connection connection) throws SQLException {
- boolean tablesCreated = jobData.getBoolean("tablesCreated");
- if (tablesCreated == false) {
- Statement statement = connection.createStatement();
-
- logger.info("Creating table (if not exists): " + App.LOGS_TABLE_NAME);
- statement.executeUpdate(getCreateExternalTableSql());
-
- logger.info("Creating table (if not exists): " + App.LOGS_LIVE_TABLE_NAME);
- statement.executeUpdate(getCreateLiveExternalTableSql());
-
- statement.close();
- jobData.put("tablesCreated", true);
- }
- }
-
- /**
- * Generate the SQL string for creating the external "api_umbrella.logs" table in Hive. This is
- * the table that stores all the data in the definitive format used for queries and processing.
- *
- * The columns are generated based on the Avro schema definition (src/main/resources/log.avsc) for
- * our log data to ensure the schemas match across the different tables.
- *
- * @return
- */
- private static String getCreateExternalTableSql() {
- if (createExternalTableSql == null) {
- StringBuilder sql = new StringBuilder();
- sql.append("CREATE EXTERNAL TABLE IF NOT EXISTS " + App.LOGS_TABLE_NAME + "(");
- for (int i = 0; i < logSchema.getNonPartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(",");
- }
- String field = logSchema.getNonPartitionFieldsList().get(i);
- sql.append(field + " " + logSchema.getFieldHiveType(field));
- }
- sql.append(") ");
-
- sql.append("PARTITIONED BY(");
- for (int i = 0; i < logSchema.getPartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(",");
- }
- String field = logSchema.getPartitionFieldsList().get(i);
- sql.append(field + " " + logSchema.getFieldHiveType(field));
- }
- sql.append(") ");
-
- sql.append("STORED AS ORC LOCATION '" + App.HDFS_LOGS_ROOT + "'");
-
- createExternalTableSql = sql.toString();
- }
-
- return createExternalTableSql;
- }
-
- /**
- * Generate the SQL string for creating the external "api_umbrella.logs_live" table in Hive. This
- * is the table that stores the temporary, minute-partitioned data that Flume writes to HDFS as
- * JSON files.
- *
- * The columns are generated based on the Avro schema definition (src/main/resources/log.avsc) for
- * our log data to ensure the schemas match across the different tables.
- *
- * @return
- */
- private static String getCreateLiveExternalTableSql() {
- if (createLiveExternalTableSql == null) {
- StringBuilder sql = new StringBuilder();
- sql.append("CREATE EXTERNAL TABLE IF NOT EXISTS " + App.LOGS_LIVE_TABLE_NAME + "(");
- for (int i = 0; i < logSchema.getLiveNonPartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(",");
- }
- String field = logSchema.getLiveNonPartitionFieldsList().get(i);
- sql.append(field + " " + logSchema.getFieldHiveType(field));
- }
- sql.append(") ");
-
- sql.append("PARTITIONED BY(");
- for (int i = 0; i < logSchema.getLivePartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(",");
- }
- String field = logSchema.getLivePartitionFieldsList().get(i);
- sql.append(field + " " + logSchema.getFieldHiveType(field));
- }
- sql.append(") ");
-
- sql.append("ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' ");
- sql.append("STORED AS TEXTFILE LOCATION '" + App.HDFS_LOGS_LIVE_ROOT + "'");
-
- createLiveExternalTableSql = sql.toString();
- }
-
- return createLiveExternalTableSql;
- }
-
- /**
- * Generate the SQL string for adding new date partitions to the "api_umbrella.logs" table.
- *
- * @return
- */
- private static String getAddPartitionSql() {
- if (addPartitionSql == null) {
- StringBuilder sql = new StringBuilder();
- sql.append("ALTER TABLE " + App.LOGS_TABLE_NAME + " ADD IF NOT EXISTS ");
- sql.append("PARTITION(");
- for (int i = 0; i < logSchema.getPartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(",");
- }
- sql.append(logSchema.getPartitionFieldsList().get(i) + "=?");
- }
- sql.append(")");
-
- addPartitionSql = sql.toString();
- }
-
- return addPartitionSql;
- }
-
- /**
- * Generate the SQL string for adding new minute partitions to the "api_umbrella.logs_live" table.
- *
- * @return
- */
- private static String getAddLivePartitionSql() {
- if (addLivePartitionSql == null) {
- StringBuilder sql = new StringBuilder();
- sql.append("ALTER TABLE " + App.LOGS_LIVE_TABLE_NAME + " ADD IF NOT EXISTS ");
- sql.append("PARTITION(");
- for (int i = 0; i < logSchema.getLivePartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(",");
- }
- sql.append(logSchema.getLivePartitionFieldsList().get(i) + "=?");
- }
- sql.append(")");
-
- addLivePartitionSql = sql.toString();
- }
-
- return addLivePartitionSql;
- }
-
- protected static String getDropLivePartitionsDaySql() {
- if (dropLivePartitionSql == null) {
- StringBuilder sql = new StringBuilder();
- sql.append("ALTER TABLE " + App.LOGS_TABLE_NAME + " DROP IF EXISTS ");
- sql.append("PARTITION(timestamp_tz_date=?)");
-
- dropLivePartitionSql = sql.toString();
- }
-
- return dropLivePartitionSql;
- }
-
- /**
- * Generate the SQL string that copies a minute worth of data from the "api_umbrella.logs_live"
- * table (JSON storage) to the real "api_umbrella.logs" table (ORC storage).
- *
- * Data is appended to the ORC table's full-day partition, so it's expected that this will run
- * repeatedly for each new minute of data, but the same minute should not be repeated (or else the
- * same data will be appended twice to the ORC table).
- *
- * @return
- */
- private static String getInsertAppendSql() {
- if (insertAppendSql == null) {
- StringBuilder sql = new StringBuilder();
- sql.append("INSERT INTO TABLE " + App.LOGS_TABLE_NAME + " ");
-
- sql.append("PARTITION(");
- for (int i = 0; i < logSchema.getPartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(",");
- }
- sql.append(logSchema.getPartitionFieldsList().get(i) + "=?");
- }
- sql.append(") ");
-
- sql.append("SELECT ");
- for (int i = 0; i < logSchema.getNonPartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(",");
- }
- sql.append(logSchema.getNonPartitionFieldsList().get(i));
- }
-
- sql.append(" FROM " + App.LOGS_LIVE_TABLE_NAME + " WHERE ");
- for (int i = 0; i < logSchema.getLivePartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(" AND ");
- }
- sql.append(logSchema.getLivePartitionFieldsList().get(i) + "=?");
- }
- sql.append(" ORDER BY timestamp_utc, id");
-
- insertAppendSql = sql.toString();
- }
-
- return insertAppendSql;
- }
-
- protected static String getInsertOverwriteDaySql() {
- if (insertOverwriteSql == null) {
- StringBuilder sql = new StringBuilder();
- sql.append("INSERT OVERWRITE TABLE " + App.LOGS_TABLE_NAME + " ");
-
- sql.append("PARTITION(");
- for (int i = 0; i < logSchema.getPartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(",");
- }
- sql.append(logSchema.getPartitionFieldsList().get(i) + "=?");
- }
- sql.append(") ");
-
- sql.append("SELECT DISTINCT ");
- for (int i = 0; i < logSchema.getNonPartitionFieldsList().size(); i++) {
- if (i > 0) {
- sql.append(",");
- }
- sql.append(logSchema.getNonPartitionFieldsList().get(i));
- }
-
- sql.append(" FROM " + App.LOGS_LIVE_TABLE_NAME + " WHERE timestamp_tz_date=?");
- sql.append(" ORDER BY timestamp_utc, id");
-
- insertOverwriteSql = sql.toString();
- }
-
- return insertOverwriteSql;
- }
-
- /**
- * Return all the minute partitions within the "api_umbrella.logs_live" table that are no longer
- * being written to.
- *
- * This is determined by looking at all of the underlying HDFS files that Flume is writing to and
- * seeing which partitions haven't been touched in over 1 minute.
- *
- * @return
- * @throws IOException
- */
- private TreeSet<DateTime> getInactivePartitions() throws IOException {
- RemoteIterator<LocatedFileStatus> filesIter =
- fileSystem.listFiles(new Path(App.HDFS_URI + App.HDFS_LOGS_LIVE_ROOT), true);
-
- TreeSet<DateTime> inactivePartitions = new TreeSet<DateTime>();
- TreeSet<DateTime> activePartitions = new TreeSet<DateTime>();
- long inactiveTimeThreshold = new DateTime().minusMinutes(1).getMillis();
- while (filesIter.hasNext()) {
- FileStatus file = filesIter.next();
- Path partition = file.getPath().getParent();
- Matcher matcher = partitionDatePattern.matcher(partition.toString());
- DateTime partitionTime = null;
- while (matcher.find()) {
- partitionTime = new DateTime(
- matcher.group(1) + "T" + matcher.group(2) + ":" + matcher.group(3) + ":00");
- }
-
- /*
- * If the file was recently written to, mark the partition as active (since we don't want to
- * touch it until we're sure no more data will show up in it).
- */
- if (file.getModificationTime() > inactiveTimeThreshold) {
- activePartitions.add(partitionTime);
- } else {
- /*
- * If the file is older, but currently being written to (denoted by Flume prefixing the file
- * with an "_" so it's ignored from Hive), then we also want to ignore it. However, this
- * case should be rare, since it would indicate Flume is writing to an older file than we
- * expect.
- */
- if (file.getPath().getName().startsWith("_")) {
- activePartitions.add(partitionTime);
- } else {
- inactivePartitions.add(partitionTime);
- }
- }
- }
-
- /*
- * Loop through all the active partitions and ensure that none of these are also marked as
- * inactive. This handles situations where the partition might have some older files, but other
- * files still being written to.
- */
- for (DateTime activePartition : activePartitions) {
- logger.debug("Active partition being written to: " + activePartition);
- inactivePartitions.remove(activePartition);
-
- /*
- * If data is unexpectedly coming in out-of-order, log this as an error, since this must
- * manually be dealt with.
- */
- if (activePartition.getMillis() < lastMigratedPartitionTime) {
- logger.warn("Partition with active files is unexpectedly older than the most recently "
- + "processed data. This means older data is being populated in the live data files. "
- + "This should be resolved during the nightly OVERWRITE refresh process. "
- + "Partition: " + activePartition);
- }
- }
-
- return inactivePartitions;
- }
-}
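Bound with concrete values, the append statement generated by getInsertAppendSql() reads roughly as follows; the 2016-03-15 12:07 minute partition is a hypothetical example and the SELECT list is abbreviated (the real list is every non-partition field from log.avsc):

    public class InsertAppendExample {
      // Resolved form of the generated statement for the hypothetical minute
      // partition 2016-03-15 12:07 (SELECT column list abbreviated).
      static final String SQL =
          "INSERT INTO TABLE api_umbrella.logs "
              + "PARTITION(timestamp_tz_year='2016-01-01',timestamp_tz_month='2016-03-01',"
              + "timestamp_tz_week='2016-03-14',timestamp_tz_date='2016-03-15') "
              + "SELECT timestamp_utc, id, timestamp_tz_offset /* ...remaining columns... */ "
              + "FROM api_umbrella.logs_live "
              + "WHERE timestamp_tz_date='2016-03-15' AND timestamp_tz_hour_minute='12-07' "
              + "ORDER BY timestamp_utc, id";
    }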
diff --git a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/RefreshKylin.java b/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/RefreshKylin.java
deleted file mode 100644
index ae9f2195d..000000000
--- a/src/api-umbrella/hadoop-analytics/processor/src/main/java/apiumbrella/hadoop_analytics/RefreshKylin.java
+++ /dev/null
@@ -1,439 +0,0 @@
-package apiumbrella.hadoop_analytics;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.TreeSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.HttpException;
-import org.apache.commons.httpclient.HttpMethod;
-import org.apache.commons.httpclient.UsernamePasswordCredentials;
-import org.apache.commons.httpclient.auth.AuthScope;
-import org.apache.commons.httpclient.methods.GetMethod;
-import org.apache.commons.httpclient.methods.PutMethod;
-import org.apache.commons.httpclient.methods.RequestEntity;
-import org.apache.commons.httpclient.methods.StringRequestEntity;
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.LocatedFileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
-import org.joda.time.DateTime;
-import org.joda.time.DateTimeZone;
-import org.joda.time.Days;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-import org.joda.time.format.ISODateTimeFormat;
-import org.quartz.DisallowConcurrentExecution;
-import org.quartz.Job;
-import org.quartz.JobExecutionContext;
-import org.quartz.JobExecutionException;
-import org.quartz.PersistJobDataAfterExecution;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.gson.JsonArray;
-import com.google.gson.JsonElement;
-import com.google.gson.JsonObject;
-import com.google.gson.JsonParser;
-import com.google.gson.JsonSyntaxException;
-
-@DisallowConcurrentExecution
-@PersistJobDataAfterExecution
-public class RefreshKylin implements Job {
- final Logger logger;
- private static final String KYLIN_URL =
- System.getProperty("apiumbrella.kylin_url", "http://127.0.0.1:7070/kylin");
- private static final String KYLIN_USERNAME =
- System.getProperty("apiumbrella.kylin_username", "ADMIN");
- private static final String KYLIN_PASSWORD =
- System.getProperty("apiumbrella.kylin_password", "KYLIN");
- private static final String KYLIN_REFRESH_END_DATE =
- System.getProperty("apiumbrella.kylin_refresh_end_date");
- private static final String CUBE_NAME = "logs_cube";
- DateTimeFormatter dateFormatter = ISODateTimeFormat.date();
- DateTimeFormatter hourMinuteFormatter = DateTimeFormat.forPattern("HH-mm");
- DateTimeFormatter dateTimeFormatter =
- DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(App.TIMEZONE);
- private HttpClient client;
- private Connection connection;
-
- public RefreshKylin() {
- logger = LoggerFactory.getLogger(this.getClass());
- logger.debug("Initializing " + this.getClass());
- }
-
- public void execute(JobExecutionContext context) throws JobExecutionException {
- try {
- connection = null;
- try {
- connection = App.getHiveConnection();
- run();
- } finally {
- if (connection != null) {
- connection.close();
- }
- }
- } catch (Exception e) {
- logger.error("Refresh kylin error", e);
- try {
- Thread.sleep(30000);
- } catch (InterruptedException e1) {
- e1.printStackTrace();
- }
- JobExecutionException jobError = new JobExecutionException(e);
- jobError.refireImmediately();
- throw jobError;
- }
- }
-
- public void run() throws JsonSyntaxException, HttpException, IOException, SQLException,
- ClassNotFoundException, IllegalArgumentException, URISyntaxException, InterruptedException {
- logger.info("Begin kylin refresh");
-
- URL url = new URL(KYLIN_URL);
- client = new HttpClient();
- UsernamePasswordCredentials credentials =
- new UsernamePasswordCredentials(KYLIN_USERNAME, KYLIN_PASSWORD);
- AuthScope authScope = new AuthScope(url.getHost(), url.getPort(), AuthScope.ANY_REALM);
- client.getState().setCredentials(authScope, credentials);
- client.getParams().setAuthenticationPreemptive(true);
-
- /* Determine the last table partition with data in it. */
- TreeSet<DateTime> logsPartitions = getPartitions(App.LOGS_TABLE_NAME);
- if (logsPartitions.isEmpty()) {
- logger.info("No data partitions exist, skipping kylin refresh");
- return;
- }
- DateTime lastPartitionDayStart = logsPartitions.last();
-
- /*
- * Determine how far Kylin should process data until. It should process until the most recent
- * partition with data, but never process today's current data (we will defer processing each
- * day until the day is over, so we don't have to worry about refreshing data).
- */
- DateTime processUntilDayStart;
- DateTime currentDayStart =
- new DateTime(App.TIMEZONE).withTimeAtStartOfDay().withZoneRetainFields(DateTimeZone.UTC);
- if (lastPartitionDayStart.isAfter(currentDayStart)
- || lastPartitionDayStart.isEqual(currentDayStart)) {
- processUntilDayStart = lastPartitionDayStart.minusDays(1);
- } else {
- processUntilDayStart = lastPartitionDayStart;
- }
- if (KYLIN_REFRESH_END_DATE != null) {
- DateTimeFormatter dateParser = ISODateTimeFormat.dateParser().withZone(App.TIMEZONE);
- processUntilDayStart = dateParser.parseDateTime(KYLIN_REFRESH_END_DATE);
- }
- logger.debug("Process until: " + processUntilDayStart + " (last partition: "
- + lastPartitionDayStart + ")");
- DateTime processUntilDayEnd = processUntilDayStart.plusDays(1);
- DateTime segmentStart = null;
- DateTime segmentEnd = null;
-
- /*
- * Before processing the last day of data, perform some additional data migrations to better
- * optimize any of the daily partitions that were originally populated from the streaming live
- * data.
- */
- TreeSet<DateTime> livePartitions = getPartitions(App.LOGS_LIVE_TABLE_NAME);
- for (DateTime livePartition : livePartitions) {
- if (livePartition.isBefore(processUntilDayStart) || livePartition.isEqual(processUntilDayStart)) {
- /*
- * First, ensure the day's partition is done writing to (this should generally be the case,
- * since this refresh kylin task is only called after midnight).
- */
- waitForNoWriteActivityOnDayPartition(livePartition);
-
- /*
- * Next, re-write the daily ORC partition with the live data. This better optimizes the data
- * for long-term storage (less stripes than the data originally written from streaming
- * appends). This also helps mitigate any potential edge-cases that may have led to the live
- * data being missed during the live append process.
- */
- overwriteDayPartitionFromLiveData(livePartition);
-
- /*
- * Finally, remove the live data for this day, since the ORC data is now the definitive
- * copy.
- */
- removeLiveDataForDay(livePartition);
- }
- }
-
- /* Loop through the existing segments and see if any of them need to be refreshed. */
- JsonArray segments = getSegments();
- for (JsonElement segment : segments) {
- JsonObject segmentObject = segment.getAsJsonObject();
-
- /* Kill this run and wait for the next one if any segments are still processing. */
- String segmentName = segmentObject.get("name").getAsString();
- String segmentStatus = segmentObject.get("status").getAsString();
- if (!segmentStatus.equals("READY")) {
- logger.info("Segment still processing, waiting for next refresh: " + segmentName + " - "
- + segmentStatus);
- return;
- }
-
- segmentStart =
- new DateTime(segmentObject.get("date_range_start").getAsLong(), DateTimeZone.UTC);
- segmentEnd = new DateTime(segmentObject.get("date_range_end").getAsLong(), DateTimeZone.UTC);
- DateTime segmentBuild =
- new DateTime(segmentObject.get("last_build_time").getAsLong(), DateTimeZone.UTC);
-
- /*
- * If the segment was built before the segment day is finished, then it needs to be refreshed.
- * We'll also continue refreshing for up to 50 minutes after the day is finished to account
- * for delayed data getting populated.
- */
- if (segmentBuild.isBefore(segmentEnd.plusMinutes(50))) {
- buildSegment("REFRESH", segmentStart, segmentEnd);
- }
- }
-
- /*
- * Define the next segment to build: Either the day following the last existing segment, or the
- * beginning of the cube if no existing segments exist.
- */
- if (segmentStart == null) {
- segmentStart = getCubeStart();
- } else {
- segmentStart = new DateTime(segmentEnd);
- }
-
- /* Loop over any segments that need building until the last partition day is hit. */
- while (segmentStart.isBefore(processUntilDayEnd)) {
- /*
- * Build a segment from the first day needed until the last partition day. But handle segments
- * longer than 1 full day a bit differently.
- */
- segmentEnd = new DateTime(processUntilDayEnd);
- if (Days.daysBetween(segmentStart, segmentEnd).getDays() > 1) {
- if (segmentStart.getYear() != segmentEnd.getYear()) {
- /*
- * If this segment would span separate years, then create separate segments for each
- * calendar year. When first importing historical data, this helps break the first big
- * bulk processing run into smaller segments.
- */
- segmentEnd = segmentStart.withDayOfYear(1).plusYears(1);
- } else {
- /*
- * Otherwise, if the segment would span multiple days, first create a segment until the
- * second to last day. Then we'll create a separate segment for the final day. This helps
- * ensure that the last day (which may contain live data) is put in its own segment, so
- * it's quicker to re-process once the day is complete.
- */
- segmentEnd = segmentEnd.minusDays(1);
- }
- }
-
- buildSegment("BUILD", segmentStart, segmentEnd);
- segmentStart = segmentEnd;
- }
-
- logger.info("Finish kylin refresh");
- }
-
- private JsonArray getSegments() throws HttpException, JsonSyntaxException, IOException {
- GetMethod method = new GetMethod(KYLIN_URL + "/api/cubes/" + CUBE_NAME);
- JsonObject result = makeRequest(method).getAsJsonObject();
- JsonArray segments = result.get("segments").getAsJsonArray();
-
- return segments;
- }
-
- private DateTime getCubeStart() throws HttpException, JsonSyntaxException, IOException {
- GetMethod method = new GetMethod(KYLIN_URL + "/api/cube_desc/" + CUBE_NAME);
- JsonObject result = makeRequest(method).getAsJsonArray().get(0).getAsJsonObject();
- DateTime start = new DateTime(result.get("partition_date_start").getAsLong(), DateTimeZone.UTC);
-
- return start;
- }
-
- private TreeSet<DateTime> getPartitions(String tableName)
- throws FileNotFoundException, IllegalArgumentException, IOException, URISyntaxException,
- ClassNotFoundException, SQLException {
- TreeSet<DateTime> partitions = new TreeSet<DateTime>();
- Pattern partitionDatePattern = Pattern.compile("timestamp_tz_date=(\\d{4}-\\d{2}-\\d{2})");
- PreparedStatement statement = connection.prepareStatement("SHOW PARTITIONS " + tableName);
- ResultSet rs = statement.executeQuery();
- while (rs.next()) {
- logger.debug(tableName + " partition: " + rs.getString("partition"));
- Matcher matcher = partitionDatePattern.matcher(rs.getString("partition").toString());
- DateTime partitionTime = null;
- while (matcher.find()) {
- partitionTime = new DateTime(matcher.group(1) + "T00:00:00");
- }
-
- partitions.add(partitionTime);
- }
-
- return partitions;
- }
-
- private void waitForNoWriteActivityOnDayPartition(DateTime partition)
- throws FileNotFoundException, IllegalArgumentException, IOException, URISyntaxException,
- InterruptedException {
- long inactiveTimeThreshold = new DateTime().minusMinutes(30).getMillis();
- long lastModifiedFile = 0;
- do {
- RemoteIterator<LocatedFileStatus> filesIter = App.getHadoopFileSystem()
- .listFiles(new Path(App.HDFS_URI + App.HDFS_LOGS_ROOT //
- + "/timestamp_tz_year=" + dateFormatter.print(partition.withDayOfYear(1))
- + "/timestamp_tz_month=" + dateFormatter.print(partition.withDayOfMonth(1))
- + "/timestamp_tz_week=" + dateFormatter.print(partition.withDayOfWeek(1))
- + "/timestamp_tz_date=" + dateFormatter.print(partition)), true);
- while (filesIter.hasNext()) {
- FileStatus file = filesIter.next();
- logger.debug(file.getPath() + " file last modified: " + file.getModificationTime());
- if (file.getModificationTime() > lastModifiedFile) {
- lastModifiedFile = file.getModificationTime();
- }
- }
-
- if (lastModifiedFile > inactiveTimeThreshold) {
- logger.info("Previous day partition has recently written data. "
- + "Waiting until write activity hasn't occurred in 30 minutes. Last modification: "
- + lastModifiedFile);
- Thread.sleep(60000);
- }
- } while (lastModifiedFile == 0 || lastModifiedFile > inactiveTimeThreshold);
- }
-
- private void overwriteDayPartitionFromLiveData(DateTime partition)
- throws ClassNotFoundException, SQLException {
- logger.info("Begin overwrite of partition " + partition + " from " + App.LOGS_LIVE_TABLE_NAME);
-
- PreparedStatement statement = connection.prepareStatement(
- "SELECT COUNT(*) AS count FROM " + App.LOGS_TABLE_NAME + " WHERE timestamp_tz_date=?");
- statement.setString(1, dateFormatter.print(partition));
- ResultSet rs = statement.executeQuery();
- long beforePartitionCount = 0;
- if (rs.next()) {
- beforePartitionCount = rs.getLong("count");
- logger.debug("Partition (" + partition + ") count before overwrite: ", beforePartitionCount);
- }
-
- PreparedStatement migrate =
- connection.prepareStatement(ConvertLiveDataToOrc.getInsertOverwriteDaySql());
- migrate.setString(1, dateFormatter.print(partition.withDayOfYear(1)));
- migrate.setString(2, dateFormatter.print(partition.withDayOfMonth(1)));
- migrate.setString(3, dateFormatter.print(partition.withDayOfWeek(1)));
- migrate.setString(4, dateFormatter.print(partition));
- migrate.setString(5, dateFormatter.print(partition));
- migrate.executeUpdate();
- migrate.close();
-
- rs = statement.executeQuery();
- long afterPartitionCount = 0;
- if (rs.next()) {
- afterPartitionCount = rs.getLong("count");
- logger.debug("Partition (" + partition + ") count after overwrite: ", beforePartitionCount);
- }
-
- if (beforePartitionCount != afterPartitionCount) {
- logger.warn("Partition (" + partition + ") counts did not match");
- }
-
- logger.info("Finish overwrite of partition " + partition + " from " + App.LOGS_LIVE_TABLE_NAME);
- }
-
- private void removeLiveDataForDay(DateTime partition)
- throws SQLException, IllegalArgumentException, IOException, URISyntaxException {
- logger.info("Removing ");
- PreparedStatement dropLivePartition =
- connection.prepareStatement(ConvertLiveDataToOrc.getDropLivePartitionsDaySql());
- dropLivePartition.setString(1, dateFormatter.print(partition));
- dropLivePartition.executeUpdate();
- dropLivePartition.close();
-
- App.getHadoopFileSystem()
- .delete(new Path(App.HDFS_URI + App.HDFS_LOGS_LIVE_ROOT //
- + "/timestamp_tz_date=" + dateFormatter.print(partition) //
- + "/timestamp_tz_hour_minute=" + hourMinuteFormatter.print(partition)), true);
- }
-
- private void buildSegment(String buildType, DateTime start, DateTime end)
- throws HttpException, JsonSyntaxException, IOException, ClassNotFoundException,
- IllegalArgumentException, SQLException, URISyntaxException {
- /*
- * Before building any segments with Kylin, ensure that the table partitions are optimized by
- * merging multiple files into 1.
- */
- App.concatenateTablePartitions(logger);
-
- logger.info("Begin building segment (" + buildType + "): " + start + " - " + end);
-
- PutMethod method = new PutMethod(KYLIN_URL + "/api/cubes/" + CUBE_NAME + "/rebuild");
-
- JsonObject data = new JsonObject();
- data.addProperty("buildType", buildType);
- data.addProperty("startTime", start.getMillis());
- data.addProperty("endTime", end.getMillis());
-
- RequestEntity entity = new StringRequestEntity(data.toString(), "application/json", "UTF-8");
- method.setRequestEntity(entity);
-
- JsonObject result = makeRequest(method).getAsJsonObject();
- String jobUuid = result.get("uuid").getAsString();
- waitForJob(jobUuid);
-
- logger.info("Finish building segment (" + buildType + "): " + start + " - " + end);
- }
-
- private String getJobStatus(String jobUuid)
- throws HttpException, JsonSyntaxException, IOException {
- GetMethod method = new GetMethod(KYLIN_URL + "/api/jobs/" + jobUuid);
- JsonObject result = makeRequest(method).getAsJsonObject();
- String status = result.get("job_status").getAsString();
-
- logger.debug("Job status: " + jobUuid + ": " + status);
-
- return status;
- }
-
- private void waitForJob(String jobUuid) throws HttpException, JsonSyntaxException, IOException {
- try {
- while (!getJobStatus(jobUuid).equals("FINISHED")) {
- Thread.sleep(30000);
- }
- } catch (InterruptedException ex) {
- Thread.currentThread().interrupt();
- }
- }
-
- private JsonElement makeRequest(HttpMethod method)
- throws HttpException, JsonSyntaxException, IOException {
- JsonElement result = null;
- try {
- client.executeMethod(method);
- int responseStatus = method.getStatusLine().getStatusCode();
- InputStreamReader responseBody = new InputStreamReader(method.getResponseBodyAsStream());
- if (responseStatus != 200) {
- logger.error("Failed to make request: " + method.getURI() + " - " + responseStatus + " - "
- + IOUtils.toString(responseBody));
- throw new HttpException("Unsuccessful HTTP response");
- }
- result = new JsonParser().parse(responseBody);
- } catch (HttpException e) {
- throw e;
- } catch (JsonSyntaxException e) {
- throw e;
- } catch (IOException e) {
- throw e;
- } finally {
- method.releaseConnection();
- }
-
- return result;
- }
-}
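The segment build/refresh call in buildSegment() boils down to a single JSON PUT against Kylin's REST API. A minimal sketch of the request body, assuming a hypothetical segment covering the single day 2016-03-15 (times in UTC milliseconds); the class name is an illustrative placeholder:

    import com.google.gson.JsonObject;

    public class KylinRebuildPayloadExample {
      public static void main(String[] args) {
        // Body PUT to <kylin_url>/api/cubes/logs_cube/rebuild for one hypothetical day.
        JsonObject data = new JsonObject();
        data.addProperty("buildType", "BUILD");        // "REFRESH" when rebuilding an existing segment
        data.addProperty("startTime", 1458000000000L); // 2016-03-15T00:00:00Z
        data.addProperty("endTime", 1458086400000L);   // 2016-03-16T00:00:00Z
        System.out.println(data); // {"buildType":"BUILD","startTime":1458000000000,"endTime":1458086400000}
      }
    }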
diff --git a/src/api-umbrella/hadoop-analytics/processor/src/main/resources/log4j.properties b/src/api-umbrella/hadoop-analytics/processor/src/main/resources/log4j.properties
deleted file mode 100644
index 6286e1e20..000000000
--- a/src/api-umbrella/hadoop-analytics/processor/src/main/resources/log4j.properties
+++ /dev/null
@@ -1,6 +0,0 @@
-log4j.rootLogger=${apiumbrella.root_log_level},stdout
-log4j.logger.apiumbrella=${apiumbrella.log_level}
-log4j.appender.stdout=org.apache.log4j.ConsoleAppender
-log4j.appender.stdout.Target=System.out
-log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
-log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} [%p] %c{1} - %m%n
\ No newline at end of file
diff --git a/src/api-umbrella/hadoop-analytics/schema/.gitignore b/src/api-umbrella/hadoop-analytics/schema/.gitignore
deleted file mode 100644
index 7e5babc1e..000000000
--- a/src/api-umbrella/hadoop-analytics/schema/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-/.classpath
-/.project
-/.settings
-/dependency-reduced-pom.xml
-/target/
diff --git a/src/api-umbrella/hadoop-analytics/schema/pom.xml b/src/api-umbrella/hadoop-analytics/schema/pom.xml
deleted file mode 100644
index 73fd89030..000000000
--- a/src/api-umbrella/hadoop-analytics/schema/pom.xml
+++ /dev/null
@@ -1,33 +0,0 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>api-umbrella.hadoop-analytics</groupId>
-    <artifactId>hadoop-analytics</artifactId>
-    <version>0.0.1-SNAPSHOT</version>
-  </parent>
-  <artifactId>schema</artifactId>
-  <packaging>jar</packaging>
-
-  <name>schema</name>
-  <url>http://maven.apache.org</url>
-
-  <properties>
-    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-  </properties>
-
-  <build>
-    <directory>${buildDir}/schema</directory>
-  </build>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.avro</groupId>
-      <artifactId>avro</artifactId>
-      <version>1.8.0</version>
-    </dependency>
-  </dependencies>
-</project>
diff --git a/src/api-umbrella/hadoop-analytics/schema/src/main/java/apiumbrella/hadoop_analytics/LogSchema.java b/src/api-umbrella/hadoop-analytics/schema/src/main/java/apiumbrella/hadoop_analytics/LogSchema.java
deleted file mode 100644
index e0068b8e9..000000000
--- a/src/api-umbrella/hadoop-analytics/schema/src/main/java/apiumbrella/hadoop_analytics/LogSchema.java
+++ /dev/null
@@ -1,139 +0,0 @@
-package apiumbrella.hadoop_analytics;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-
-import org.apache.avro.Schema;
-import org.apache.avro.Schema.Type;
-
-public class LogSchema {
- private Schema schema;
- private HashMap<String, Schema.Type> fieldTypes;
- private ArrayList<String> fieldNames;
- private ArrayList<String> partitionFields = new ArrayList<String>();
- private ArrayList<String> nonPartitionFields = new ArrayList<String>();
- private ArrayList<String> livePartitionFields = new ArrayList<String>();
- private ArrayList<String> liveNonPartitionFields = new ArrayList<String>();
- private HashSet<String> shortFields = new HashSet<String>();
- private HashSet<String> dateFields = new HashSet<String>();
-
- public LogSchema() {
- fieldTypes = new HashMap<String, Schema.Type>();
- fieldNames = new ArrayList<String>();
- for (Schema.Field field : getSchema().getFields()) {
- Schema.Type type = field.schema().getType();
- if (type == Schema.Type.UNION) {
- for (Schema unionSchema : field.schema().getTypes()) {
- if (unionSchema.getType() != Schema.Type.NULL) {
- type = unionSchema.getType();
- break;
- }
- }
- }
-
- fieldTypes.put(field.name(), type);
- fieldNames.add(field.name());
- }
-
- // Explicitly define which fields we'll be partitioning by, since these
- // don't need to be sorted in the output file (since they're part of the
- // file path, it's duplicative to store this data in the file).
- partitionFields.add("timestamp_tz_year");
- partitionFields.add("timestamp_tz_month");
- partitionFields.add("timestamp_tz_week");
- partitionFields.add("timestamp_tz_date");
-
- livePartitionFields.add("timestamp_tz_date");
- livePartitionFields.add("timestamp_tz_hour_minute");
- fieldTypes.put("timestamp_tz_hour_minute", Schema.Type.STRING);
-
- for (String name : fieldNames) {
- if (!partitionFields.contains(name)) {
- nonPartitionFields.add(name);
- }
-
- if (!livePartitionFields.contains(name)) {
- liveNonPartitionFields.add(name);
- }
- }
-
- // Define fields we want to store as short/smallints. Since Avro doesn't
- // support these in its schema, but ORC does, we need to explicitly list
- // these.
- shortFields.add("response_status");
-
- dateFields.add("timestamp_tz_year");
- dateFields.add("timestamp_tz_month");
- dateFields.add("timestamp_tz_week");
- dateFields.add("timestamp_tz_date");
- }
-
- protected Schema getSchema() {
- if (schema == null) {
- InputStream is = Schema.class.getClassLoader().getResourceAsStream("log.avsc");
- try {
- schema = new Schema.Parser().parse(is);
- } catch (IOException e) {
- e.printStackTrace();
- System.exit(1);
- }
- }
- return schema;
- }
-
- protected Type getFieldType(String field) {
- return fieldTypes.get(field);
- }
-
- protected boolean isFieldTypeShort(String field) {
- return shortFields.contains(field);
- }
-
- protected String getFieldHiveType(String field) {
- Type type = getFieldType(field);
- if (type == Schema.Type.INT) {
- if (isFieldTypeShort(field)) {
- return "SMALLINT";
- } else {
- return "INT";
- }
- } else if (type == Schema.Type.LONG) {
- return "BIGINT";
- } else if (type == Schema.Type.DOUBLE) {
- return "DOUBLE";
- } else if (type == Schema.Type.BOOLEAN) {
- return "BOOLEAN";
- } else if (type == Schema.Type.STRING) {
- if (dateFields.contains(field)) {
- return "DATE";
- } else {
- return "STRING";
- }
- } else {
- return null;
- }
- }
-
- protected ArrayList<String> getFieldNames() {
- return fieldNames;
- }
-
- protected ArrayList<String> getPartitionFieldsList() {
- return partitionFields;
- }
-
- protected ArrayList<String> getNonPartitionFieldsList() {
- return nonPartitionFields;
- }
-
- protected ArrayList<String> getLivePartitionFieldsList() {
- return livePartitionFields;
- }
-
- protected ArrayList<String> getLiveNonPartitionFieldsList() {
- return liveNonPartitionFields;
- }
-}
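A few concrete results of the Avro-to-Hive mapping in getFieldHiveType(), using field names from log.avsc. A sketch only: the example class name is hypothetical and would need to live in the apiumbrella.hadoop_analytics package, since these accessors are protected and are normally called from the processor's SQL generators.

    package apiumbrella.hadoop_analytics;

    public class LogSchemaMappingExample {
      public static void main(String[] args) {
        LogSchema logSchema = new LogSchema();
        // Example mappings (per log.avsc and the shortFields/dateFields sets above):
        System.out.println(logSchema.getFieldHiveType("response_status"));   // SMALLINT (Avro int, in shortFields)
        System.out.println(logSchema.getFieldHiveType("timestamp_utc"));     // BIGINT   (Avro long)
        System.out.println(logSchema.getFieldHiveType("request_ip_lat"));    // DOUBLE   (Avro ["null","double"] union)
        System.out.println(logSchema.getFieldHiveType("timestamp_tz_date")); // DATE     (Avro string, in dateFields)
        System.out.println(logSchema.getFieldHiveType("request_url_path"));  // STRING   (Avro string)
      }
    }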
diff --git a/src/api-umbrella/hadoop-analytics/schema/src/main/resources/log.avsc b/src/api-umbrella/hadoop-analytics/schema/src/main/resources/log.avsc
deleted file mode 100644
index 402b2bd18..000000000
--- a/src/api-umbrella/hadoop-analytics/schema/src/main/resources/log.avsc
+++ /dev/null
@@ -1,222 +0,0 @@
-{
- "type": "record",
- "name": "Log",
- "fields": [
- {
- "name": "timestamp_utc",
- "type": "long"
- },
- {
- "name": "id",
- "type": "string"
- },
- {
- "name": "timestamp_tz_offset",
- "type": "int"
- },
- {
- "name": "timestamp_tz_year",
- "type": "string"
- },
- {
- "name": "timestamp_tz_month",
- "type": "string"
- },
- {
- "name": "timestamp_tz_week",
- "type": "string"
- },
- {
- "name": "timestamp_tz_date",
- "type": "string"
- },
- {
- "name": "timestamp_tz_hour",
- "type": "string"
- },
- {
- "name": "timestamp_tz_minute",
- "type": "string"
- },
- {
- "name": "user_id",
- "type": ["null", "string"]
- },
- {
- "name": "denied_reason",
- "type": ["null", "string"]
- },
- {
- "name": "request_method",
- "type": "string"
- },
- {
- "name": "request_url_scheme",
- "type": "string"
- },
- {
- "name": "request_url_host",
- "type": "string"
- },
- {
- "name": "request_url_port",
- "type": "int"
- },
- {
- "name": "request_url_path",
- "type": "string"
- },
- {
- "name": "request_url_path_level1",
- "type": ["null", "string"]
- },
- {
- "name": "request_url_path_level2",
- "type": ["null", "string"]
- },
- {
- "name": "request_url_path_level3",
- "type": ["null", "string"]
- },
- {
- "name": "request_url_path_level4",
- "type": ["null", "string"]
- },
- {
- "name": "request_url_path_level5",
- "type": ["null", "string"]
- },
- {
- "name": "request_url_path_level6",
- "type": ["null", "string"]
- },
- {
- "name": "request_url_query",
- "type": ["null", "string"]
- },
- {
- "name": "request_ip",
- "type": "string"
- },
- {
- "name": "request_ip_country",
- "type": ["null", "string"]
- },
- {
- "name": "request_ip_region",
- "type": ["null", "string"]
- },
- {
- "name": "request_ip_city",
- "type": ["null", "string"]
- },
- {
- "name": "request_ip_lat",
- "type": ["null", "double"]
- },
- {
- "name": "request_ip_lon",
- "type": ["null", "double"]
- },
- {
- "name": "request_user_agent",
- "type": ["null", "string"]
- },
- {
- "name": "request_user_agent_type",
- "type": ["null", "string"]
- },
- {
- "name": "request_user_agent_family",
- "type": ["null", "string"]
- },
- {
- "name": "request_size",
- "type": ["null", "int"]
- },
- {
- "name": "request_accept",
- "type": ["null", "string"]
- },
- {
- "name": "request_accept_encoding",
- "type": ["null", "string"]
- },
- {
- "name": "request_content_type",
- "type": ["null", "string"]
- },
- {
- "name": "request_connection",
- "type": ["null", "string"]
- },
- {
- "name": "request_origin",
- "type": ["null", "string"]
- },
- {
- "name": "request_referer",
- "type": ["null", "string"]
- },
- {
- "name": "request_basic_auth_username",
- "type": ["null", "string"]
- },
- {
- "name": "response_status",
- "type": "int"
- },
- {
- "name": "response_content_type",
- "type": ["null", "string"]
- },
- {
- "name": "response_content_length",
- "type": ["null", "int"]
- },
- {
- "name": "response_content_encoding",
- "type": ["null", "string"]
- },
- {
- "name": "response_transfer_encoding",
- "type": ["null", "string"]
- },
- {
- "name": "response_server",
- "type": ["null", "string"]
- },
- {
- "name": "response_cache",
- "type": ["null", "string"]
- },
- {
- "name": "response_age",
- "type": ["null", "int"]
- },
- {
- "name": "response_size",
- "type": ["null", "int"]
- },
- {
- "name": "timer_response",
- "type": ["null", "double"]
- },
- {
- "name": "timer_backend_response",
- "type": ["null", "double"]
- },
- {
- "name": "timer_internal",
- "type": ["null", "double"]
- },
- {
- "name": "timer_proxy_overhead",
- "type": ["null", "double"]
- },
- {
- "name": "log_imported",
- "type": ["null", "boolean"]
- }
- ]
-}
diff --git a/src/api-umbrella/web-app/Gemfile b/src/api-umbrella/web-app/Gemfile
index 7d7cae2ba..6f7d8b713 100644
--- a/src/api-umbrella/web-app/Gemfile
+++ b/src/api-umbrella/web-app/Gemfile
@@ -18,9 +18,6 @@ gem "rack-proxy", "~> 0.6.4"
gem "multi_json", "~> 1.13.1"
gem "oj", "~> 3.6.0", :platforms => [:ruby]
-# SQL escape libraries for Kylin analytics.
-gem "sequel", "~> 5.8.0"
-
# MongoDB
gem "mongoid", "~> 5.2.1"
diff --git a/src/api-umbrella/web-app/Gemfile.lock b/src/api-umbrella/web-app/Gemfile.lock
index 9e704a372..1231a4a58 100644
--- a/src/api-umbrella/web-app/Gemfile.lock
+++ b/src/api-umbrella/web-app/Gemfile.lock
@@ -282,7 +282,6 @@ GEM
sprockets (>= 2.8, < 4.0)
sprockets-rails (>= 2.0, < 4.0)
tilt (>= 1.1, < 3)
- sequel (5.8.0)
simple_form (3.5.1)
actionpack (> 4, < 5.2)
activemodel (> 4, < 5.2)
@@ -353,7 +352,6 @@ DEPENDENCIES
rollbar (~> 2.15.5)
safe_yaml (~> 1.0.4)
sass-rails (~> 5.0)
- sequel (~> 5.8.0)
simple_form (~> 3.5.1)
BUNDLED WITH
diff --git a/src/api-umbrella/web-app/app/controllers/admin/sessions_controller.rb b/src/api-umbrella/web-app/app/controllers/admin/sessions_controller.rb
index 4332f6ada..530e225cb 100644
--- a/src/api-umbrella/web-app/app/controllers/admin/sessions_controller.rb
+++ b/src/api-umbrella/web-app/app/controllers/admin/sessions_controller.rb
@@ -17,7 +17,6 @@ def auth
if current_admin
response.merge!({
"analytics_timezone" => ApiUmbrellaConfig[:analytics][:timezone],
- "enable_beta_analytics" => (ApiUmbrellaConfig[:analytics][:adapter] == "kylin" || (ApiUmbrellaConfig[:analytics][:outputs] && ApiUmbrellaConfig[:analytics][:outputs].include?("kylin"))),
"username_is_email" => ApiUmbrellaConfig[:web][:admin][:username_is_email],
"local_auth_enabled" => ApiUmbrellaConfig[:web][:admin][:auth_strategies][:_enabled][:local],
"password_length_min" => ApiUmbrellaConfig[:web][:admin][:password_length_min],
diff --git a/src/api-umbrella/web-app/app/controllers/admin/stats_controller.rb b/src/api-umbrella/web-app/app/controllers/admin/stats_controller.rb
index 62b4c8b0f..90baa8c77 100644
--- a/src/api-umbrella/web-app/app/controllers/admin/stats_controller.rb
+++ b/src/api-umbrella/web-app/app/controllers/admin/stats_controller.rb
@@ -37,16 +37,8 @@ def search
end
def logs
- # TODO: For the SQL fetching, set start_time to end_time to limit to last
- # 24 hours. If we do end up limiting it to the last 24 hours by default,
- # figure out a better way to document this and still allow downloading
- # the full data set.
- start_time = params[:start_at]
- if(@analytics_adapter == "kylin")
- start_time = Time.zone.parse(params[:end_at]) - 1.day
- end
@search = LogSearch.factory(@analytics_adapter, {
- :start_time => start_time,
+ :start_time => params[:start_at],
:end_time => params[:end_at],
:interval => params[:interval],
})
diff --git a/src/api-umbrella/web-app/app/controllers/api/v0/analytics_controller.rb b/src/api-umbrella/web-app/app/controllers/api/v0/analytics_controller.rb
index 7bc642e6c..25eab07cd 100644
--- a/src/api-umbrella/web-app/app/controllers/api/v0/analytics_controller.rb
+++ b/src/api-umbrella/web-app/app/controllers/api/v0/analytics_controller.rb
@@ -107,9 +107,9 @@ def generate_summary
:end_time => Time.now.utc,
:interval => "month",
- # This query can take a long time to run against PrestoDB, so set a long
- # timeout. But since we're only delivering cached results and refreshing
- # periodically in the background, this long timeout should be okay.
+ # This query can take a long time to run, so set a long timeout. But
+ # since we're only delivering cached results and refreshing periodically
+ # in the background, this long timeout should be okay.
:query_timeout => 20 * 60, # 20 minutes
})
diff --git a/src/api-umbrella/web-app/app/controllers/application_controller.rb b/src/api-umbrella/web-app/app/controllers/application_controller.rb
index 182f77980..86930ffad 100644
--- a/src/api-umbrella/web-app/app/controllers/application_controller.rb
+++ b/src/api-umbrella/web-app/app/controllers/application_controller.rb
@@ -91,11 +91,7 @@ def use_locale
end
def set_analytics_adapter
- if(params[:beta_analytics] == "true")
- @analytics_adapter = "kylin"
- else
- @analytics_adapter = ApiUmbrellaConfig[:analytics][:adapter]
- end
+ @analytics_adapter = ApiUmbrellaConfig[:analytics][:adapter]
end
def set_time_zone
diff --git a/src/api-umbrella/web-app/app/models/log_result.rb b/src/api-umbrella/web-app/app/models/log_result.rb
index f5309e9ca..825f220e8 100644
--- a/src/api-umbrella/web-app/app/models/log_result.rb
+++ b/src/api-umbrella/web-app/app/models/log_result.rb
@@ -3,10 +3,6 @@ def self.factory(search, raw_result)
case(search)
when LogSearch::ElasticSearch
LogResult::ElasticSearch.new(search, raw_result)
- when LogSearch::Kylin
- LogResult::Kylin.new(search, raw_result)
- when LogSearch::Postgresql
- LogResult::Postgresql.new(search, raw_result)
end
end
end
diff --git a/src/api-umbrella/web-app/app/models/log_result/kylin.rb b/src/api-umbrella/web-app/app/models/log_result/kylin.rb
deleted file mode 100644
index 7e3ede752..000000000
--- a/src/api-umbrella/web-app/app/models/log_result/kylin.rb
+++ /dev/null
@@ -1,2 +0,0 @@
-class LogResult::Kylin < LogResult::Sql
-end
diff --git a/src/api-umbrella/web-app/app/models/log_result/postgresql.rb b/src/api-umbrella/web-app/app/models/log_result/postgresql.rb
deleted file mode 100644
index a5a699ab5..000000000
--- a/src/api-umbrella/web-app/app/models/log_result/postgresql.rb
+++ /dev/null
@@ -1,2 +0,0 @@
-class LogResult::Postgresql < LogResult::Sql
-end
diff --git a/src/api-umbrella/web-app/app/models/log_result/sql.rb b/src/api-umbrella/web-app/app/models/log_result/sql.rb
deleted file mode 100644
index 5b21e82b1..000000000
--- a/src/api-umbrella/web-app/app/models/log_result/sql.rb
+++ /dev/null
@@ -1,20 +0,0 @@
-class LogResult::Sql < LogResult::Base
- def initialize(search, raw_result)
- super
-
- if(search.result_processors.present?)
- search.result_processors.each do |processor|
- processor.call(self)
- end
- end
- end
-
- def column_indexes(query_name)
- column_indexes = {}
- raw_result[query_name]["columnMetas"].each_with_index do |meta, index|
- column_indexes[meta["label"].downcase] = index
- end
-
- column_indexes
- end
-end
diff --git a/src/api-umbrella/web-app/app/models/log_search.rb b/src/api-umbrella/web-app/app/models/log_search.rb
index 7ffbd9b3b..f0a2288be 100644
--- a/src/api-umbrella/web-app/app/models/log_search.rb
+++ b/src/api-umbrella/web-app/app/models/log_search.rb
@@ -3,10 +3,6 @@ def self.factory(adapter, options = {})
case(adapter)
when "elasticsearch"
LogSearch::ElasticSearch.new(options)
- when "kylin"
- LogSearch::Kylin.new(options)
- when "postgresql"
- LogSearch::Postgresql.new(options)
end
end
end
diff --git a/src/api-umbrella/web-app/app/models/log_search/kylin.rb b/src/api-umbrella/web-app/app/models/log_search/kylin.rb
deleted file mode 100644
index 4ae5c176c..000000000
--- a/src/api-umbrella/web-app/app/models/log_search/kylin.rb
+++ /dev/null
@@ -1,112 +0,0 @@
-class LogSearch::Kylin < LogSearch::Sql
- def execute_kylin(sql)
- @kylin_conn ||= Faraday.new(:url => "#{ApiUmbrellaConfig[:kylin][:protocol]}://#{ApiUmbrellaConfig[:kylin][:host]}:#{ApiUmbrellaConfig[:kylin][:port]}") do |faraday|
- faraday.response :logger
- faraday.adapter Faraday.default_adapter
- faraday.basic_auth "ADMIN", "KYLIN"
- end
- if(ApiUmbrellaConfig[:kylin][:auth])
- @kylin_conn.basic_auth(ApiUmbrellaConfig[:kylin][:auth][:username], ApiUmbrellaConfig[:kylin][:auth][:password])
- end
-
- Rails.logger.info(sql)
- response = @kylin_conn.post do |req|
- req.url "/kylin/api/query"
- req.headers["Content-Type"] = "application/json"
- req.body = MultiJson.dump({
- :acceptPartial => false,
- :project => "api_umbrella",
- :sql => sql,
- })
- end
-
- if(response.status != 200)
- Rails.logger.error(response.body)
- raise "Kylin Error"
- end
-
- MultiJson.load(response.body)
- end
-
- def execute_presto(sql)
- @presto_conn ||= Faraday.new(:url => "#{ApiUmbrellaConfig[:presto][:protocol]}://#{ApiUmbrellaConfig[:presto][:host]}:#{ApiUmbrellaConfig[:presto][:port]}") do |faraday|
- faraday.response :logger
- faraday.adapter Faraday.default_adapter
- end
- if(ApiUmbrellaConfig[:presto][:auth])
- @presto_conn.basic_auth(ApiUmbrellaConfig[:presto][:auth][:username], ApiUmbrellaConfig[:presto][:auth][:password])
- end
-
- Rails.logger.info(sql)
- response = @presto_conn.post do |req|
- req.url "/v1/statement"
- req.headers["Content-Type"] = "text/plain"
- req.headers["X-Presto-User"] = "presto"
- req.headers["X-Presto-Catalog"] = "hive"
- req.headers["X-Presto-Schema"] = "default"
- if(@options[:query_timeout])
- req.headers["X-Presto-Session"] = "query_max_run_time=#{@options[:query_timeout]}s"
- end
- req.body = sql
- end
-
- results = {
- "columnMetas" => [],
- "results" => [],
- }
- while(response.status == 200)
- query_result = MultiJson.load(response.body)
- if(query_result["stats"] && query_result["stats"]["state"] == "FAILED")
- Rails.logger.error(response.body)
- raise "Presto Error"
- end
-
- if(results["columnMetas"].empty? && query_result["columns"])
- results["columnMetas"] = query_result["columns"].map do |column|
- {
- "label" => column["name"],
- }
- end
- end
-
- if(query_result["data"].present?)
- results["results"] += query_result["data"]
- end
-
- if(query_result["nextUri"])
- response = @presto_conn.get do |req|
- req.url(query_result["nextUri"])
- end
- else
- break
- end
- end
-
- if(response.status != 200)
- Rails.logger.error(response.body)
- raise "Presto Error"
- end
-
- results
- end
-
- def execute_query(query_name, query = {})
- unless @query_results[query_name]
- sql = build_query(query)
-
- if(@needs_presto)
- results = execute_presto(sql)
- else
- begin
- results = execute_kylin(sql)
- rescue
- results = execute_presto(sql)
- end
- end
-
- @query_results[query_name] = results
- end
-
- @query_results[query_name]
- end
-end
diff --git a/src/api-umbrella/web-app/app/models/log_search/postgresql.rb b/src/api-umbrella/web-app/app/models/log_search/postgresql.rb
deleted file mode 100644
index 227a1bec1..000000000
--- a/src/api-umbrella/web-app/app/models/log_search/postgresql.rb
+++ /dev/null
@@ -1,2 +0,0 @@
-class LogSearch::Postgresql < LogSearch::Sql
-end
diff --git a/src/api-umbrella/web-app/app/models/log_search/sql.rb b/src/api-umbrella/web-app/app/models/log_search/sql.rb
deleted file mode 100644
index cd44e846a..000000000
--- a/src/api-umbrella/web-app/app/models/log_search/sql.rb
+++ /dev/null
@@ -1,926 +0,0 @@
-class LogSearch::Sql < LogSearch::Base
- attr_reader :result_processors
-
- NOT_NULL_FIELDS = [
- "request_ip",
- ].freeze
-
- LEGACY_FIELDS = {
- "backend_response_time" => "timer_backend_response",
- "gatekeeper_denied_code" => "denied_reason",
- "imported" => "log_imported",
- "internal_gatekeeper_time" => "timer_internal",
- "proxy_overhead" => "timer_proxy_overhead",
- "request_at" => "timestamp_utc",
- "request_host" => "request_url_host",
- "request_path" => "request_url_path",
- "request_scheme" => "request_url_scheme",
- "response_time" => "timer_response",
- }.freeze
-
- FIELD_TYPES = {
- "response_status" => :int,
- }.freeze
-
- def initialize(options = {})
- super
-
- @sequel = Sequel.connect("mock://postgresql")
-
- case(@interval)
- when "minute"
- @interval_field = "timestamp_tz_minute"
- @interval_field_format = "%Y-%m-%d %H:%M:%S"
- when "hour"
- @interval_field = "timestamp_tz_hour"
- @interval_field_format = "%Y-%m-%d %H:%M:%S"
- when "day"
- @interval_field = "timestamp_tz_date"
- @interval_field_format = "%Y-%m-%d"
- when "week"
- @interval_field = "timestamp_tz_week"
- @interval_field_format = "%Y-%m-%d"
- when "month"
- @interval_field = "timestamp_tz_month"
- @interval_field_format = "%Y-%m-%d"
- end
-
- @query = {
- :select => [],
- :where => [],
- :group_by => [],
- :order_by => [],
- }
-
- @queries = {}
- @query_results = {}
-
- @query_options = {
- :size => 0,
- :ignore_unavailable => "missing",
- :allow_no_indices => true,
- }
-
- @result_processors = []
- end
-
- def build_query(query)
- select = @query[:select] + (query[:select] || [])
- sql = "SELECT #{select.join(", ")} FROM api_umbrella.logs"
-
- where = @query[:where] + (query[:where] || [])
- if(where.present?)
- sql << " WHERE #{where.map { |clause| "(#{clause})" }.join(" AND ")}"
- end
-
- group_by = @query[:group_by] + (query[:group_by] || [])
- if(group_by.present?)
- sql << " GROUP BY #{group_by.join(", ")}"
- end
-
- order_by = @query[:order_by] + (query[:order_by] || [])
- if(order_by.present?)
- sql << " ORDER BY #{order_by.join(", ")}"
- end
-
- limit = query[:limit] || @query[:limit]
- if(limit.present?)
- sql << " LIMIT #{limit}"
- end
-
- sql
- end
-
- def result
- if @none
- @result = LogResult.factory(self, {})
- return @result
- end
-
- if(@query_results.empty? || @queries[:default])
- execute_query(:default, @queries[:default] || {})
- end
-
- @result = LogResult.factory(self, @query_results)
- end
-
- def permission_scope!(scopes)
- filter = []
- scopes["rules"].each do |rule|
- filter << parse_query_builder(rule)
- end
-
- @query[:where] << filter.join(" OR ")
- end
-
- def search_type!(search_type)
- if(search_type == "count")
- @queries[:default] ||= {}
- @queries[:default][:select] ||= []
- @queries[:default][:select] << "COUNT(*) AS total_count"
-
- @result_processors << proc do |result|
- count = 0
- column_indexes = result.column_indexes(:default)
- result.raw_result[:default]["results"].each do |row|
- count = row[column_indexes["total_count"]].to_i
- end
-
- result.raw_result["hits"] ||= {}
- result.raw_result["hits"]["total"] = count
- end
- end
- end
-
- def search!(query_string)
- if(query_string.present?)
- parser = LuceneQueryParser::Parser.new
-
- # LuceneQueryParser doesn't support the "!" alias for NOT. This is a
- # quick hack, but we should address in the gem for better parsing
- # accuracy (otherwise, this could replace ! inside strings).
- query_string.gsub!("!", " NOT ")
-
- data = parser.parse(query_string)
-
- query = {
- "condition" => "AND",
- "rules" => [],
- }
-
- data.each do |expression|
- rule = {
- "id" => expression[:field].to_s,
- "field" => expression[:field].to_s,
- "operator" => "equal",
- "value" => expression[:term].to_s,
- }
-
- if(expression[:term])
- rule["value"] = expression[:term].to_s
- if(rule["value"].end_with?("*"))
- rule["value"].gsub!(/\*$/, "")
- rule["operator"] = "begins_with"
- elsif(rule["value"].start_with?("*"))
- rule["value"].gsub!(/^\*/, "")
- rule["operator"] = "ends_with"
- else
- rule["operator"] = "equal"
- end
-
- if(expression[:op].to_s == "NOT")
- rule["operator"] = "not_#{rule["operator"]}"
- end
-
- query["rules"] << rule
- elsif(expression[:inclusive_range])
- query["rules"] << rule.merge({
- "operator" => "greater_or_equal",
- "value" => expression[:inclusive_range][:from].to_s,
- })
-
- query["rules"] << rule.merge({
- "operator" => "less_or_equal",
- "value" => expression[:inclusive_range][:to].to_s,
- })
- elsif(expression[:exclusive_range])
- query["rules"] << rule.merge({
- "operator" => "greater",
- "value" => expression[:exclusive_range][:from].to_s,
- })
-
- query["rules"] << rule.merge({
- "operator" => "less",
- "value" => expression[:exclusive_range][:to].to_s,
- })
- end
- end
-
- filter = parse_query_builder(query)
- if(filter.present?)
- @query[:where] << filter
- end
- end
- end
-
- def query!(query)
- if(query.kind_of?(String) && query.present?)
- query = MultiJson.load(query)
- end
-
- filter = parse_query_builder(query)
- if(filter.present?)
- @query[:where] << filter
- end
- end
-
- def parse_query_builder(query)
- query_filter = nil
-
- if(query.present?)
- filters = []
- query["rules"].each do |rule|
- field = rule["field"]
- if(LEGACY_FIELDS[field])
- field = LEGACY_FIELDS[field]
- end
-
- filter = nil
- operator = nil
- value = rule["value"]
-
- if(!CASE_SENSITIVE_FIELDS.include?(rule["field"]) && value.kind_of?(String))
- # FIXME: Is this needed now that everything is case-sensitive in SQL?
- # value.downcase!
- end
-
- if(value.present?)
- case(FIELD_TYPES[field])
- when :int
- value = Integer(value)
- when :double
- value = Float(value)
- end
- end
-
- case(field)
- when "request_method"
- value.upcase!
- end
-
- case(rule["operator"])
- when "equal"
- operator = "="
- when "not_equal"
- # Use IS DISTINCT FROM instead of <> (aka !=), since we want to treat
- # NULL values as not equal to the given value.
- operator = "IS DISTINCT FROM"
- when "begins_with"
- operator = "LIKE"
- value = "#{value}%"
- when "not_begins_with"
- operator = "NOT LIKE"
- value = "#{value}%"
- when "ends_with"
- operator = "LIKE"
- value = "%#{value}"
- when "not_ends_with"
- operator = "NOT LIKE"
- value = "%#{value}"
- when "contains"
- operator = "LIKE"
- value = "%#{value}%"
- when "not_contains"
- operator = "NOT LIKE"
- value = "%#{value}%"
- when "is_null"
- operator = "IS NULL"
- value = nil
- when "is_not_null"
- operator = "IS NOT NULL"
- value = nil
- when "less"
- operator = "<"
- when "less_or_equal"
- operator = "<="
- when "greater"
- operator = ">"
- when "greater_or_equal"
- operator = ">="
- when "between"
- values = rule["value"].map { |v| v.to_f }.sort
- filter = "#{@sequel.quote_identifier(field)} >= #{@sequel.literal(values[0])} AND #{@sequel.quote_identifier(field)} <= #{@sequel.literal(values[1])}"
- else
- raise "unknown filter operator: #{rule["operator"]} (rule: #{rule.inspect})"
- end
-
- if(field == "request_url_path" && ["equal", "not_equal", "begins_with", "not_begins_with"].include?(rule["operator"]))
- level_filters = []
- levels = value.split(%r{/+}, 6).reject { |l| l.blank? }
- levels.each_with_index do |level, index|
- level_field = "request_url_path_level#{index + 1}"
-
- level_value = level.dup
- if(index == 0)
- level_value = "/#{level_value}"
- end
- if(index < levels.length - 1)
- level_value = "#{level_value}/"
- end
-
- if(index < levels.length - 1)
- if(["not_equal", "not_begins_with"].include?(rule["operator"]))
- level_operator = "IS DISTINCT FROM"
- else
- level_operator = "="
- end
- else
- level_operator = operator
- end
-
- level_filter = "#{@sequel.quote_identifier(level_field)} #{level_operator}"
- unless(level_value.nil?)
- level_filter << " #{@sequel.literal(level_value)}"
- end
-
- level_filters << level_filter
- end
-
- filter = level_filters.join(" AND ")
- end
-
- unless(filter)
- filter = "#{@sequel.quote_identifier(field)} #{operator}"
- unless(value.nil?)
- filter << " #{@sequel.literal(value)}"
- end
- end
-
- filters << filter
- end
-
- if(filters.present?)
- where = filters.map { |w| "(#{w})" }
- if(query["condition"] == "OR")
- query_filter = where.join(" OR ")
- else
- query_filter = where.join(" AND ")
- end
- end
- end
-
- query_filter
- end
-
- def offset!(from)
- @query_options[:from] = from
- end
-
- def limit!(size)
- @query[:limit] = size
- end
-
- def sort!(sort)
- sort.each do |s|
- field = s.keys.first
- order = s.values.first
- if(order == "desc" || order == "asc")
- order = order.upcase
- else
- order = "ASC"
- end
-
- @query[:order_by] << "#{@sequel.quote_identifier(field)} #{order}"
- end
-
- @query[:sort] = sort
- end
-
- def exclude_imported!
- @query[:where] << "log_imported IS DISTINCT FROM true"
- end
-
- def filter_by_date_range!
- @query[:where] << @sequel.literal(Sequel.lit("timestamp_tz_date >= CAST(:start_time_date AS DATE) AND timestamp_tz_date <= CAST(:end_time_date AS DATE)", {
- :start_time_date => @start_time.strftime("%Y-%m-%d"),
- :end_time_date => @end_time.strftime("%Y-%m-%d"),
- }))
- end
-
- def filter_by_request_path!(request_path)
- @query[:query][:filtered][:filter][:bool][:must] << {
- :term => {
- :request_path => request_path,
- },
- }
- end
-
- def filter_by_api_key!(api_key)
- user = ApiUser.where(:api_key => api_key).first
- @query[:where] << @sequel.literal(Sequel.lit("user_id = ?", user.id))
- end
-
- def filter_by_user!(user_email)
- user = ApiUser.where(:email => user_email).first
- @query[:where] << @sequel.literal(Sequel.lit("user_id = ?", user.id))
- end
-
- def filter_by_user_ids!(user_ids)
- @query[:where] << @sequel.literal(Sequel.lit("user_id IN ?", user_ids))
- end
-
- def aggregate_by_drilldown!(prefix, size = 0)
- @drilldown_prefix_segments = prefix.split("/")
- @drilldown_depth = @drilldown_prefix_segments[0].to_i
-
- # Define the hierarchy of fields that will be involved in this query given
- # the depth.
- @drilldown_fields = ["request_url_host"]
- (1..@drilldown_depth).each do |i|
- @drilldown_fields << "request_url_path_level#{i}"
- end
- @drilldown_depth_field = @drilldown_fields.last
-
- # Define common parts of the query for all drilldown queries.
- @drilldown_common_query = {
- :select => ["COUNT(*) AS hits"],
- :where => [],
- :group_by => [],
- }
- @drilldown_fields.each_with_index do |field, index|
- # If we're grouping by the top-level of the hierarchy (the hostname), we
- # need to perform some custom logic to determine whether the hostname has
- # any children data.
- #
- # For all the request_url_path_level* fields, we store the value with a
- # trailing slash if there's further parts of the hierarchy. This gives us
- # an easy way to determine whether the specific level is the terminating
- # level of the hierarchy. In other words, this gives us a way to
- # distinguish traffic ending in /foo versus /foo/bar (where /foo is a
- # parent level). Since the host field doesn't follow this convention, we
- # must emulate it here by looking at the level1 path to see if it's NULL
- # or NOT NULL, and append the appropriate slash.
- if(@drilldown_depth == 0)
- @drilldown_common_query[:select] << "request_url_host || CASE WHEN request_url_path_level1 IS NULL THEN '' ELSE '/' END AS request_url_host"
- @drilldown_common_query[:group_by] << "request_url_host, CASE WHEN request_url_path_level1 IS NULL THEN '' ELSE '/' END"
- else
- @drilldown_common_query[:select] << field
- @drilldown_common_query[:group_by] << field
- end
- @drilldown_common_query[:where] << "#{field} IS NOT NULL"
-
- # Match all the parts of the host and path that are part of the prefix.
- prefix_match_value = @drilldown_prefix_segments[index + 1]
- if prefix_match_value
- # Since we're only interested in matching the parent values, all of the
- # request_url_path_level* fields should have trailing slashes (since
- # that denotes that there's child data). request_url_path_level1 should
- # also have a slash prefix (since it's the beginning of the path).
- if(index == 1)
- prefix_match_value = "/#{prefix_match_value}/"
- elsif(index > 1)
- prefix_match_value = "#{prefix_match_value}/"
- end
- @drilldown_common_query[:where] << @sequel.literal(Sequel.lit("#{field} = ?", prefix_match_value))
- end
- end
-
- execute_query(:drilldown, {
- :select => @drilldown_common_query[:select],
- :where => @drilldown_common_query[:where],
- :group_by => @drilldown_common_query[:group_by],
- :order_by => ["hits DESC"],
- })
-
- # Massage the query into the aggregation format matching our old
- # elasticsearch queries.
- @result_processors << proc do |result|
- buckets = []
- column_indexes = result.column_indexes(:drilldown)
- result.raw_result[:drilldown]["results"].each do |row|
- buckets << {
- "key" => build_drilldown_prefix_from_result(row, column_indexes),
- "doc_count" => row[column_indexes["hits"]].to_i,
- }
- end
-
- result.raw_result["aggregations"] ||= {}
- result.raw_result["aggregations"]["drilldown"] ||= {}
- result.raw_result["aggregations"]["drilldown"]["buckets"] = buckets
- end
- end
-
- def aggregate_by_drilldown_over_time!(prefix)
- # Grab the top 10 paths out of the query previously done inside
- # #aggregate_by_drilldown!
- #
- # A sub-query would probably be more efficient, but I'm still experiencing
- # this error in Kylin 1.2: https://issues.apache.org/jira/browse/KYLIN-814
- top_paths = []
- top_path_indexes = {}
- column_indexes = result.column_indexes(:drilldown)
- @query_results[:drilldown]["results"][0, 10].each_with_index do |row, index|
- value = row[column_indexes[@drilldown_depth_field]]
- top_path_indexes[value] = index
- if(@drilldown_depth == 0)
- value = value.chomp("/")
- end
- top_paths << value
- end
-
- # Get a date-based breakdown of the traffic to the top 10 paths.
- top_path_where = []
- if(top_paths.any?)
- top_path_where << @sequel.literal(Sequel.lit("#{@drilldown_depth_field} IN ?", top_paths))
- end
- execute_query(:top_path_hits_over_time, {
- :select => @drilldown_common_query[:select] + ["#{@interval_field} AS interval_field"],
- :where => @drilldown_common_query[:where] + top_path_where,
- :group_by => @drilldown_common_query[:group_by] + [@interval_field],
- :order_by => ["interval_field"],
- })
-
- # Get a date-based breakdown of the traffic to all paths. This is used to
- # come up with how much to allocate to the "Other" category (by subtracting
- # away the top 10 traffic). This probably isn't the best way to go about
- # this in SQL-land, but this is how we did things in ElasticSearch, so for
- # now, keep with that same approach.
- #
- # Since we want a sum of all the traffic, we need to remove the last level
- # of group by and selects (since that would give us per-path breakdowns,
- # and we only want totals).
- all_drilldown_select = @drilldown_common_query[:select][0..-2]
- all_drilldown_group_by = @drilldown_common_query[:group_by][0..-2]
- execute_query(:hits_over_time, {
- :select => all_drilldown_select + ["#{@interval_field} AS interval_field"],
- :where => @drilldown_common_query[:where],
- :group_by => all_drilldown_group_by + [@interval_field],
- :order_by => ["interval_field"],
- })
-
- # Massage the top_path_hits_over_time query into the aggregation format
- # matching our old elasticsearch queries.
- @result_processors << proc do |result|
- buckets = []
- column_indexes = result.column_indexes(:top_path_hits_over_time)
- result.raw_result[:top_path_hits_over_time]["results"].each do |row|
- # Store the hierarchy breakdown in the order of overall traffic (so the
- # path with the most traffic is always at the bottom of the graph).
- path_index = top_path_indexes[row[column_indexes[@drilldown_depth_field]]]
-
- # If the path index isn't set, skip this result for top hit processing.
- # This can happen when we're grouping by the top-level host, since our
- # query to match results returns everything matching the top hostnames,
- # however we may only be considering "example.com/" vs "example.com" as
- # part of the top hits.
- next unless(path_index)
-
- unless buckets[path_index]
- buckets[path_index] = {
- "key" => build_drilldown_prefix_from_result(row, column_indexes),
- "doc_count" => 0,
- "drilldown_over_time" => {
- "time_buckets" => {},
- },
- }
- end
-
- hits = row[column_indexes["hits"]].to_i
- time = strptime_in_zone(row[column_indexes["interval_field"]], @interval_field_format)
-
- buckets[path_index]["doc_count"] += hits
- buckets[path_index]["drilldown_over_time"]["time_buckets"][time.to_i] = {
- "key" => time.to_i * 1000,
- "key_as_string" => time.utc.iso8601,
- "doc_count" => hits,
- }
- end
-
- buckets.each do |bucket|
- bucket["drilldown_over_time"]["buckets"] = fill_in_time_buckets(bucket["drilldown_over_time"].delete("time_buckets"))
- end
-
- result.raw_result["aggregations"] ||= {}
- result.raw_result["aggregations"]["top_path_hits_over_time"] ||= {}
- result.raw_result["aggregations"]["top_path_hits_over_time"]["buckets"] = buckets
- end
-
- # Massage the hits_over_time query into the aggregation format matching our
- # old elasticsearch queries.
- @result_processors << proc do |result|
- time_buckets = {}
- column_indexes = result.column_indexes(:hits_over_time)
- result.raw_result[:hits_over_time]["results"].each do |row|
- time = strptime_in_zone(row[column_indexes["interval_field"]], @interval_field_format)
- time_buckets[time.to_i] = {
- "key" => time.to_i * 1000,
- "key_as_string" => time.utc.iso8601,
- "doc_count" => row[column_indexes["hits"]].to_i,
- }
- end
-
- result.raw_result["aggregations"] ||= {}
- result.raw_result["aggregations"]["hits_over_time"] ||= {}
- result.raw_result["aggregations"]["hits_over_time"]["buckets"] = fill_in_time_buckets(time_buckets)
- end
- end
-
- def aggregate_by_interval!
- execute_query(:hits_over_time, {
- :select => ["COUNT(*) AS hits", "#{@interval_field} AS interval_field"],
- :group_by => [@interval_field],
- :order_by => ["interval_field"],
- })
-
- # Massage the hits_over_time query into the aggregation format matching our
- # old elasticsearch queries.
- @result_processors << proc do |result|
- time_buckets = {}
- column_indexes = result.column_indexes(:hits_over_time)
- result.raw_result[:hits_over_time]["results"].each do |row|
- time = strptime_in_zone(row[column_indexes["interval_field"]], @interval_field_format)
- time_buckets[time.to_i] = {
- "key" => time.to_i * 1000,
- "key_as_string" => time.utc.iso8601,
- "doc_count" => row[column_indexes["hits"]].to_i,
- }
- end
-
- result.raw_result["aggregations"] ||= {}
- result.raw_result["aggregations"]["hits_over_time"] ||= {}
- result.raw_result["aggregations"]["hits_over_time"]["buckets"] = fill_in_time_buckets(time_buckets)
- end
- end
-
- def aggregate_by_region_field!(field)
- @query[:aggregations] ||= {}
- @query[:aggregations][:regions] = {
- :terms => {
- :field => field.to_s,
- },
- }
-
- field = field.to_s
- @query[:select] << "COUNT(*) AS hits"
- @query[:select] << @sequel.quote_identifier(field)
- @query[:group_by] << @sequel.quote_identifier(field)
-
- @result_processors << proc do |result|
- buckets = []
- null_count = 0
- column_indexes = result.column_indexes(:default)
- result.raw_result[:default]["results"].each do |row|
- region = row[column_indexes[field]]
- hits = row[column_indexes["hits"]].to_i
- if(region.nil?)
- null_count = hits
- else
- buckets << {
- "key" => region,
- "doc_count" => hits,
- }
- end
- end
-
- result.raw_result["aggregations"] ||= {}
- result.raw_result["aggregations"]["regions"] ||= {}
- result.raw_result["aggregations"]["regions"]["buckets"] = buckets
-
- result.raw_result["aggregations"]["missing_regions"] ||= {}
- result.raw_result["aggregations"]["missing_regions"]["doc_count"] = null_count
- end
- end
-
- def aggregate_by_country_regions!(country)
- @query[:where] << @sequel.literal(Sequel.lit("request_ip_country = ?", country))
- aggregate_by_region_field!(:request_ip_region)
- end
-
- def aggregate_by_us_state_cities!(country, state)
- @query[:where] << @sequel.literal(Sequel.lit("request_ip_country = ?", country))
- @query[:where] << @sequel.literal(Sequel.lit("request_ip_region = ?", state))
- aggregate_by_region_field!(:request_ip_city)
- end
-
- def aggregate_by_country_cities!(country)
- @query[:where] << @sequel.literal(Sequel.lit("request_ip_country = ?", country))
- aggregate_by_region_field!(:request_ip_city)
- end
-
- def aggregate_by_term!(field, size)
- field = field.to_s
-
- query_name_top = :"aggregate_by_term_#{field}_top"
- execute_query(query_name_top, {
- :select => ["COUNT(*) AS hits", @sequel.quote_identifier(field)],
- :where => ["#{@sequel.quote_identifier(field)} IS NOT NULL"],
- :group_by => [@sequel.quote_identifier(field)],
- :order_by => ["hits DESC"],
- :limit => size,
- })
-
- query_name_count = :"aggregate_by_term_#{field}_count"
- execute_query(query_name_count, {
- :select => ["COUNT(*) AS hits"],
- })
-
- query_name_null_count = :"aggregate_by_term_#{field}_null_count"
- # Optimization: Skip IS NULL query for any NOT NULL columns, since it
- # will always be 0.
- if(!NOT_NULL_FIELDS.include?(field))
- execute_query(query_name_null_count, {
- :select => ["COUNT(*) AS hits"],
- :where => ["#{@sequel.quote_identifier(field)} IS NULL"],
- })
- end
-
- @result_processors << proc do |result|
- buckets = []
- column_indexes = result.column_indexes(query_name_top)
- result.raw_result[query_name_top]["results"].each do |row|
- buckets << {
- "key" => row[column_indexes[field]],
- "doc_count" => row[column_indexes["hits"]].to_i,
- }
- end
-
- result.raw_result["aggregations"] ||= {}
- result.raw_result["aggregations"]["top_#{field.pluralize}"] ||= {}
- result.raw_result["aggregations"]["top_#{field.pluralize}"]["buckets"] = buckets
- end
-
- @result_processors << proc do |result|
- count = 0
- column_indexes = result.column_indexes(query_name_count)
- result.raw_result[query_name_count]["results"].each do |row|
- count = row[column_indexes["hits"]].to_i
- end
-
- result.raw_result["aggregations"] ||= {}
- result.raw_result["aggregations"]["value_count_#{field.pluralize}"] ||= {}
- result.raw_result["aggregations"]["value_count_#{field.pluralize}"]["value"] = count
- end
-
- @result_processors << proc do |result|
- count = 0
- # Still populate the NOT NULL fields with a 0 value for compatibility.
- if(!NOT_NULL_FIELDS.include?(field))
- column_indexes = result.column_indexes(query_name_null_count)
- result.raw_result[query_name_null_count]["results"].each do |row|
- count = row[column_indexes["hits"]].to_i
- end
- end
-
- result.raw_result["aggregations"] ||= {}
- result.raw_result["aggregations"]["missing_#{field.pluralize}"] ||= {}
- result.raw_result["aggregations"]["missing_#{field.pluralize}"]["doc_count"] = count
- end
- end
-
- def aggregate_by_cardinality!(field)
- field = field.to_s
- @queries[:default] ||= {}
- @queries[:default][:select] ||= []
- @queries[:default][:select] << "COUNT(DISTINCT #{@sequel.quote_identifier(field)}) AS #{@sequel.quote_identifier("#{field}_distinct_count")}"
-
- @result_processors << proc do |result|
- count = 0
- column_indexes = result.column_indexes(:default)
- result.raw_result[:default]["results"].each do |row|
- count = row[column_indexes["#{field}_distinct_count"]].to_i
- end
-
- result.raw_result["aggregations"] ||= {}
- result.raw_result["aggregations"]["unique_#{field.pluralize}"] ||= {}
- result.raw_result["aggregations"]["unique_#{field.pluralize}"]["value"] = count
- end
- end
-
- def aggregate_by_users!(size)
- aggregate_by_term!(:user_id, size)
- aggregate_by_cardinality!(:user_id)
-
- @result_processors << proc do |result|
- result.raw_result["aggregations"]["missing_user_emails"] = result.raw_result["aggregations"].delete("missing_user_ids")
- result.raw_result["aggregations"]["top_user_emails"] = result.raw_result["aggregations"].delete("top_user_ids")
- result.raw_result["aggregations"]["unique_user_emails"] = result.raw_result["aggregations"].delete("unique_user_ids")
- result.raw_result["aggregations"]["value_count_user_emails"] = result.raw_result["aggregations"].delete("value_count_user_ids")
-
- user_ids = result.raw_result["aggregations"]["top_user_emails"]["buckets"].map { |bucket| bucket["key"] }
- users_by_id = ApiUser.where(:id.in => user_ids).group_by { |u| u.id }
- result.raw_result["aggregations"]["top_user_emails"]["buckets"].each do |bucket|
- user = users_by_id[bucket["key"]]
- if(user && user.first)
- bucket["key"] = user.first.email
- end
- end
- end
- end
-
- def aggregate_by_request_ip!(size)
- aggregate_by_term!(:request_ip, size)
- aggregate_by_cardinality!(:request_ip)
- end
-
- def aggregate_by_user_stats!(options = {})
- @query[:select] << "COUNT(*) AS hits"
- @query[:select] << "MAX(timestamp_utc) AS last_request_at"
- @query[:select] << "user_id"
- @query[:group_by] << "user_id"
-
- if(options[:order])
- if(options[:order]["_count"] == "asc")
- @query[:order_by] << "hits ASC"
- elsif(options[:order]["_count"] == "desc")
- @query[:order_by] << "hits DESC"
- elsif(options[:order]["last_request_at"] == "asc")
- @query[:order_by] << "last_request_at ASC"
- elsif(options[:order]["last_request_at"] == "desc")
- @query[:order_by] << "last_request_at DESC"
- end
- end
-
- @result_processors << proc do |result|
- buckets = []
- column_indexes = result.column_indexes(:default)
- result.raw_result[:default]["results"].each do |row|
- buckets << {
- "key" => row[column_indexes["user_id"]],
- "doc_count" => row[column_indexes["hits"]].to_i,
- "last_request_at" => {
- "value" => row[column_indexes["last_request_at"]].to_f,
- "value_as_string" => Time.at(row[column_indexes["last_request_at"]].to_i / 1000.0).iso8601,
- },
- }
- end
-
- result.raw_result["aggregations"] ||= {}
- result.raw_result["aggregations"]["user_stats"] ||= {}
- result.raw_result["aggregations"]["user_stats"]["buckets"] = buckets
- end
- end
-
- def aggregate_by_response_time_average!
- @queries[:default] ||= {}
- @queries[:default][:select] ||= []
- @queries[:default][:select] << "AVG(timer_response) AS average_timer_response"
-
- @result_processors << proc do |result|
- average = 0
- column_indexes = result.column_indexes(:default)
- result.raw_result[:default]["results"].each do |row|
- average = row[column_indexes["average_timer_response"]].to_f
- end
-
- result.raw_result["aggregations"] ||= {}
- result.raw_result["aggregations"]["response_time_average"] ||= {}
- result.raw_result["aggregations"]["response_time_average"]["value"] = average
- end
- end
-
- def select_records!
- @needs_presto = true
- @query[:select] << "*"
-
- @result_processors << proc do |result|
- hits = []
- columns = result.raw_result[:default]["columnMetas"].map { |m| m["label"].downcase }
- result.raw_result[:default]["results"].each do |row|
- data = Hash[columns.zip(row)]
- data["request_url"] = "#{data["request_url_scheme"]}://#{data["request_url_host"]}:#{data["request_url_port"]}#{data["request_url_path"]}?#{data["request_url_query"]}"
- hits << { "_source" => data }
- end
-
- result.raw_result["hits"] ||= {}
- result.raw_result["hits"]["total"] = hits.length
- result.raw_result["hits"]["hits"] = hits
- end
- end
-
- private
-
- def fill_in_time_buckets(time_buckets)
- time = @start_time
- case @interval
- when "minute"
- time = time.change(:sec => 0)
- else
- time = time.send(:"beginning_of_#{@interval}")
- end
-
- buckets = []
- while(time <= @end_time)
- time_bucket = time_buckets[time.to_i]
- time_bucket ||= {
- "key" => time.to_i * 1000,
- "key_as_string" => time.utc.iso8601,
- "doc_count" => 0,
- }
-
- buckets << time_bucket
-
- time += 1.send(:"#{@interval}")
- end
-
- buckets
- end
-
- def build_drilldown_prefix_from_result(row, column_indexes)
- key = [@drilldown_depth.to_s]
- @drilldown_fields.each do |field|
- key << row[column_indexes[field]]
- end
- File.join(key)
- end
-
- # Perform strptime in the current timezone. Based on time-zone aware strptime
- # that's present in Rails 5 (but not Rails 3):
- # https://github.com/rails/rails/pull/19618
- #
- # If the current timezone is US Eastern, this allows for parsing something
- # like "2016-03-21" into "2016-03-21 04:00:00 UTC"
- def strptime_in_zone(string, format)
- t = Time.strptime(string, format)
- Time.zone.local(t.year, t.month, t.mday, t.hour, t.min, t.sec)
- end
-end
diff --git a/templates/etc/flume.properties.mustache b/templates/etc/flume.properties.mustache
deleted file mode 100644
index 021f9c414..000000000
--- a/templates/etc/flume.properties.mustache
+++ /dev/null
@@ -1,42 +0,0 @@
-api-umbrella-agent.sources = kafka-source
-api-umbrella-agent.channels = file-channel
-api-umbrella-agent.sinks = hdfs-sink
-
-# Use file storage for buffering data.
-api-umbrella-agent.channels.file-channel.type = file
-api-umbrella-agent.channels.file-channel.checkpointDir = {{db_dir}}/flume/file-channel/checkpoint
-api-umbrella-agent.channels.file-channel.dataDirs = {{db_dir}}/flume/file-channel/data
-api-umbrella-agent.channels.file-channel.use-fast-replay = true
-
-# Pull data from Kafka (which rsyslog is pumping data into).
-api-umbrella-agent.sources.kafka-source.type = org.apache.flume.source.kafka.KafkaSource
-api-umbrella-agent.sources.kafka-source.channels = file-channel
-api-umbrella-agent.sources.kafka-source.zookeeperConnect = 127.0.0.1:2181
-api-umbrella-agent.sources.kafka-source.groupId = api-umbrella-flume
-api-umbrella-agent.sources.kafka-source.topic = api_umbrella_logs
-
-# Extract the timestamp_utc from the JSON data.
-api-umbrella-agent.sources.kafka-source.interceptors = timestamp-extractor
-api-umbrella-agent.sources.kafka-source.interceptors.timestamp-extractor.type = regex_extractor
-api-umbrella-agent.sources.kafka-source.interceptors.timestamp-extractor.regex = "timestamp_utc": ?(\\d+)
-api-umbrella-agent.sources.kafka-source.interceptors.timestamp-extractor.serializers = timestamp-serializer
-api-umbrella-agent.sources.kafka-source.interceptors.timestamp-extractor.serializers.timestamp-serializer.type = org.apache.flume.interceptor.RegexExtractorInterceptorPassThroughSerializer
-api-umbrella-agent.sources.kafka-source.interceptors.timestamp-extractor.serializers.timestamp-serializer.name = timestamp
-
-# Write data to HDFS.
-api-umbrella-agent.sinks.hdfs-sink.type = hdfs
-api-umbrella-agent.sinks.hdfs-sink.channel = file-channel
-api-umbrella-agent.sinks.hdfs-sink.hdfs.path = /apps/api-umbrella/logs-live/timestamp_tz_date=%Y-%m-%d/timestamp_tz_hour_minute=%H-%M
-# Use "_" prefix for files currently being written to. This prevent Hive from
-# picking these files up before they're fully written (otherwise, the partial
-# gzip state results in invalid data).
-api-umbrella-agent.sinks.hdfs-sink.hdfs.inUsePrefix = _
-# Write compressed JSON files.
-api-umbrella-agent.sinks.hdfs-sink.hdfs.fileType = CompressedStream
-api-umbrella-agent.sinks.hdfs-sink.hdfs.codeC = gzip
-api-umbrella-agent.sinks.hdfs-sink.hdfs.fileSuffix = .json.gz
-# Write out the file ever 15 seconds.
-api-umbrella-agent.sinks.hdfs-sink.hdfs.rollInterval = 15
-api-umbrella-agent.sinks.hdfs-sink.hdfs.rollSize = 0
-api-umbrella-agent.sinks.hdfs-sink.hdfs.rollCount = 0
-api-umbrella-agent.sinks.hdfs-sink.hdfs.timeZone = {{analytics.timezone}}
diff --git a/templates/etc/kylin/kylin.properties b/templates/etc/kylin/kylin.properties
deleted file mode 100644
index 9a861dbfe..000000000
--- a/templates/etc/kylin/kylin.properties
+++ /dev/null
@@ -1,174 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# kylin server's mode
-kylin.server.mode=all
-
-# optional information for the owner of kylin platform, it can be your team's email
-# currently it will be attached to each kylin's htable attribute
-kylin.owner=whoami@kylin.apache.org
-
-# List of web servers in use, this enables one web server instance to sync up with other servers.
-kylin.rest.servers=localhost:7070
-
-# The metadata store in hbase
-kylin.metadata.url=kylin_metadata@hbase
-
-# The storage for final cube file in hbase
-kylin.storage.url=hbase
-
-# Temp folder in hdfs, make sure user has the right access to the hdfs directory
-kylin.hdfs.working.dir=/kylin
-
-# HBase Cluster FileSystem, which serving hbase, format as hdfs://hbase-cluster:8020
-# leave empty if hbase running on same cluster with hive and mapreduce
-kylin.hbase.cluster.fs=
-
-kylin.job.mapreduce.default.reduce.input.mb=500
-
-# max job retry on error, default 0: no retry
-kylin.job.retry=0
-
-# If true, job engine will not assume that hadoop CLI reside on the same server as it self
-# you will have to specify kylin.job.remote.cli.hostname, kylin.job.remote.cli.username and kylin.job.remote.cli.password
-# It should not be set to "true" unless you're NOT running Kylin.sh on a hadoop client machine
-# (Thus kylin instance has to ssh to another real hadoop client machine to execute hbase,hive,hadoop commands)
-kylin.job.run.as.remote.cmd=false
-
-# Only necessary when kylin.job.run.as.remote.cmd=true
-kylin.job.remote.cli.hostname=
-
-# Only necessary when kylin.job.run.as.remote.cmd=true
-kylin.job.remote.cli.username=
-
-# Only necessary when kylin.job.run.as.remote.cmd=true
-kylin.job.remote.cli.password=
-
-# Used by test cases to prepare synthetic data for sample cube
-kylin.job.remote.cli.working.dir=/tmp/kylin
-
-# Max count of concurrent jobs running
-kylin.job.concurrent.max.limit=10
-
-# Time interval to check hadoop job status
-kylin.job.yarn.app.rest.check.interval.seconds=10
-
-# Hive database name for putting the intermediate flat tables
-kylin.job.hive.database.for.intermediatetable=default
-
-#default compression codec for htable,snappy,lzo,gzip,lz4
-kylin.hbase.default.compression.codec=snappy
-
-#the percentage of the sampling, default 100%
-kylin.job.cubing.inmem.sampling.percent=100
-
-# The cut size for hbase region, in GB.
-kylin.hbase.region.cut=5
-
-# The hfile size of GB, smaller hfile leading to the converting hfile MR has more reducers and be faster
-# set 0 to disable this optimization
-kylin.hbase.hfile.size.gb=2
-
-# Enable/disable ACL check for cube query
-kylin.query.security.enabled=true
-
-# whether get job status from resource manager with kerberos authentication
-kylin.job.status.with.kerberos=false
-
-## kylin security configurations
-
-# spring security profile, options: testing, ldap, saml
-# with "testing" profile, user can use pre-defined name/pwd like KYLIN/ADMIN to login
-kylin.security.profile=testing
-
-# default roles and admin roles in LDAP, for ldap and saml
-acl.defaultRole=ROLE_ANALYST,ROLE_MODELER
-acl.adminRole=ROLE_ADMIN
-
-#LDAP authentication configuration
-ldap.server=ldap://ldap_server:389
-ldap.username=
-ldap.password=
-
-#LDAP user account directory;
-ldap.user.searchBase=
-ldap.user.searchPattern=
-ldap.user.groupSearchBase=
-
-#LDAP service account directory
-ldap.service.searchBase=
-ldap.service.searchPattern=
-ldap.service.groupSearchBase=
-
-#SAML configurations for SSO
-# SAML IDP metadata file location
-saml.metadata.file=classpath:sso_metadata.xml
-saml.metadata.entityBaseURL=https://hostname/kylin
-saml.context.scheme=https
-saml.context.serverName=hostname
-saml.context.serverPort=443
-saml.context.contextPath=/kylin
-
-
-ganglia.group=
-ganglia.port=8664
-
-## Config for mail service
-
-# If true, will send email notification;
-mail.enabled=false
-mail.host=
-mail.username=
-mail.password=
-mail.sender=
-
-###########################config info for web#######################
-
-#help info ,format{name|displayName|link} ,optional
-kylin.web.help.length=4
-kylin.web.help.0=start|Getting Started|
-kylin.web.help.1=odbc|ODBC Driver|
-kylin.web.help.2=tableau|Tableau Guide|
-kylin.web.help.3=onboard|Cube Design Tutorial|
-
-#guide user how to build streaming cube
-kylin.web.streaming.guide=http://kylin.apache.org/
-
-#hadoop url link ,optional
-kylin.web.hadoop=
-#job diagnostic url link ,optional
-kylin.web.diagnostic=
-#contact mail on web page ,optional
-kylin.web.contact_mail=
-
-###########################config info for front#######################
-
-#env DEV|QA|PROD
-deploy.env=QA
-
-###########################deprecated configs#######################
-kylin.sandbox=true
-kylin.web.hive.limit=20
-# The cut size for hbase region,
-#in GB.
-# E.g, for cube whose capacity be marked as "SMALL", split region per 5GB by default
-kylin.hbase.region.cut.small=5
-kylin.hbase.region.cut.medium=10
-kylin.hbase.region.cut.large=50
-
-# Custom Config
-kylin.rest.timezone=GMT
diff --git a/templates/etc/kylin/kylin_hive_conf.xml b/templates/etc/kylin/kylin_hive_conf.xml
deleted file mode 100644
index 3682c4760..000000000
--- a/templates/etc/kylin/kylin_hive_conf.xml
+++ /dev/null
@@ -1,94 +0,0 @@
-<?xml version="1.0"?>
-<configuration>
-
-    <property>
-        <name>dfs.replication</name>
-        <value>1</value>
-        <description>Block replication</description>
-    </property>
-
-    <property>
-        <name>dfs.block.size</name>
-        <value>33554432</value>
-        <description>32MB, want more mappers for in-mem cubing, thus smaller the DFS block size</description>
-    </property>
-
-    <property>
-        <name>hive.exec.compress.output</name>
-        <value>true</value>
-        <description>enable compress</description>
-    </property>
-
-    <property>
-        <name>hive.auto.convert.join.noconditionaltask</name>
-        <value>true</value>
-        <description>enable map-side join</description>
-    </property>
-
-    <property>
-        <name>hive.auto.convert.join.noconditionaltask.size</name>
-        <value>300000000</value>
-        <description>enable map-side join</description>
-    </property>
-
-    <property>
-        <name>mapreduce.map.output.compress.codec</name>
-        <value>org.apache.hadoop.io.compress.SnappyCodec</value>
-    </property>
-
-    <property>
-        <name>mapreduce.output.fileoutputformat.compress.codec</name>
-        <value>org.apache.hadoop.io.compress.SnappyCodec</value>
-    </property>
-
-    <property>
-        <name>hive.merge.mapfiles</name>
-        <value>true</value>
-        <description>Enable hive file merge on mapper only job</description>
-    </property>
-
-    <property>
-        <name>hive.merge.mapredfiles</name>
-        <value>true</value>
-        <description>Enable hive file merge on map-reduce job</description>
-    </property>
-
-    <property>
-        <name>mapred.output.compression.type</name>
-        <value>BLOCK</value>
-        <description>The compression type to use for job outputs</description>
-    </property>
-
-    <property>
-        <name>hive.merge.size.per.task</name>
-        <value>256000000</value>
-        <description>Size for the merged file</description>
-    </property>
-
-    <property>
-        <name>hive.support.concurrency</name>
-        <value>false</value>
-        <description>Hive concurrency lock</description>
-    </property>
-
-    <property>
-        <name>mapreduce.job.split.metainfo.maxsize</name>
-        <value>-1</value>
-        <description>The maximum permissible size of the split metainfo file.
-        The JobTracker won't attempt to read split metainfo files bigger than
-        the configured value. No limits if set to -1.</description>
-    </property>
-
-</configuration>
diff --git a/templates/etc/kylin/kylin_job_conf.xml b/templates/etc/kylin/kylin_job_conf.xml
deleted file mode 100644
index fd89584d5..000000000
--- a/templates/etc/kylin/kylin_job_conf.xml
+++ /dev/null
@@ -1,113 +0,0 @@
-<?xml version="1.0"?>
-<configuration>
-
-    <property>
-        <name>mapreduce.job.split.metainfo.maxsize</name>
-        <value>-1</value>
-        <description>The maximum permissible size of the split metainfo file.
-        The JobTracker won't attempt to read split metainfo files bigger than
-        the configured value. No limits if set to -1.</description>
-    </property>
-
-    <property>
-        <name>mapred.compress.map.output</name>
-        <value>true</value>
-        <description>Compress map outputs</description>
-    </property>
-
-    <property>
-        <name>mapred.map.output.compression.codec</name>
-        <value>org.apache.hadoop.io.compress.SnappyCodec</value>
-        <description>The compression codec to use for map outputs</description>
-    </property>
-
-    <property>
-        <name>mapred.output.compress</name>
-        <value>true</value>
-        <description>Compress the output of a MapReduce job</description>
-    </property>
-
-    <property>
-        <name>mapred.output.compression.codec</name>
-        <value>org.apache.hadoop.io.compress.SnappyCodec</value>
-        <description>The compression codec to use for job outputs</description>
-    </property>
-
-    <property>
-        <name>mapred.output.compression.type</name>
-        <value>BLOCK</value>
-        <description>The compression type to use for job outputs</description>
-    </property>
-
-    <property>
-        <name>mapreduce.job.max.split.locations</name>
-        <value>2000</value>
-        <description>No description</description>
-    </property>
-
-    <property>
-        <name>dfs.replication</name>
-        <value>1</value>
-        <description>Block replication</description>
-    </property>
-
-    <property>
-        <name>mapred.task.timeout</name>
-        <value>3600000</value>
-        <description>Set task timeout to 1 hour</description>
-    </property>
-
-    <property>
-        <name>mapreduce.map.memory.mb</name>
-        <value>1024</value>
-    </property>
-
-    <property>
-        <name>mapreduce.map.java.opts</name>
-        <value>-Xmx750m</value>
-    </property>
-
-    <property>
-        <name>mapreduce.map.memory.mb</name>
-        <value>3072</value>
-    </property>
-
-    <property>
-        <name>mapreduce.map.java.opts</name>
-        <value>-Xmx2700m</value>
-    </property>
-
-    <property>
-        <name>mapreduce.task.io.sort.mb</name>
-        <value>200</value>
-    </property>
-
-</configuration>
diff --git a/templates/etc/kylin/kylin_job_conf_inmem.xml b/templates/etc/kylin/kylin_job_conf_inmem.xml
deleted file mode 100644
index ec4a202c0..000000000
--- a/templates/etc/kylin/kylin_job_conf_inmem.xml
+++ /dev/null
@@ -1,98 +0,0 @@
-<?xml version="1.0"?>
-<configuration>
-
-    <property>
-        <name>mapreduce.job.split.metainfo.maxsize</name>
-        <value>-1</value>
-        <description>The maximum permissible size of the split metainfo file.
-        The JobTracker won't attempt to read split metainfo files bigger than
-        the configured value. No limits if set to -1.</description>
-    </property>
-
-    <property>
-        <name>mapred.compress.map.output</name>
-        <value>true</value>
-        <description>Compress map outputs</description>
-    </property>
-
-    <property>
-        <name>mapred.map.output.compression.codec</name>
-        <value>org.apache.hadoop.io.compress.SnappyCodec</value>
-        <description>The compression codec to use for map outputs</description>
-    </property>
-
-    <property>
-        <name>mapred.output.compress</name>
-        <value>true</value>
-        <description>Compress the output of a MapReduce job</description>
-    </property>
-
-    <property>
-        <name>mapred.output.compression.codec</name>
-        <value>org.apache.hadoop.io.compress.SnappyCodec</value>
-        <description>The compression codec to use for job outputs</description>
-    </property>
-
-    <property>
-        <name>mapred.output.compression.type</name>
-        <value>BLOCK</value>
-        <description>The compression type to use for job outputs</description>
-    </property>
-
-    <property>
-        <name>mapreduce.job.max.split.locations</name>
-        <value>2000</value>
-        <description>No description</description>
-    </property>
-
-    <property>
-        <name>dfs.replication</name>
-        <value>1</value>
-        <description>Block replication</description>
-    </property>
-
-    <property>
-        <name>mapred.task.timeout</name>
-        <value>3600000</value>
-        <description>Set task timeout to 1 hour</description>
-    </property>
-
-    <property>
-        <name>mapreduce.map.memory.mb</name>
-        <value>3072</value>
-    </property>
-
-    <property>
-        <name>mapreduce.map.java.opts</name>
-        <value>-Xmx2700m</value>
-    </property>
-
-    <property>
-        <name>mapreduce.task.io.sort.mb</name>
-        <value>200</value>
-    </property>
-
-</configuration>
diff --git a/templates/etc/perp/flume/rc.env.mustache b/templates/etc/perp/flume/rc.env.mustache
deleted file mode 100644
index 045b49ba8..000000000
--- a/templates/etc/perp/flume/rc.env.mustache
+++ /dev/null
@@ -1 +0,0 @@
-JAVA_OPTS={{flume.embedded_server_config.java_opts}}
diff --git a/templates/etc/perp/flume/rc.log b/templates/etc/perp/flume/rc.log
deleted file mode 100755
index 089baa65a..000000000
--- a/templates/etc/perp/flume/rc.log
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/usr/bin/env bash
-exec ../rc.log "$@"
diff --git a/templates/etc/perp/flume/rc.main.mustache b/templates/etc/perp/flume/rc.main.mustache
deleted file mode 100755
index ae252bcfd..000000000
--- a/templates/etc/perp/flume/rc.main.mustache
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/env bash
-
-# Redirect stderr to stdout
-exec 2>&1
-
-if [ "${1}" = "start" ]; then
- echo "starting ${2}..."
- api_umbrella_user="{{user}}"
-
- run_args=("-e" "rc.env")
- if [ -n "$api_umbrella_user" ]; then
- run_args+=("-u" "$api_umbrella_user")
- fi
-
- exec runtool "${run_args[@]}" "{{_embedded_root_dir}}/flume/bin/flume-ng" \
- agent \
- --name api-umbrella-agent \
- --conf "{{_embedded_root_dir}}/flume/conf" \
- --conf-file "{{etc_dir}}/flume.properties" \
- -Dflume.root.logger=INFO,console
-fi
-
-exit 0
diff --git a/templates/etc/perp/kylin/rc.env.mustache b/templates/etc/perp/kylin/rc.env.mustache
deleted file mode 100644
index fa4994032..000000000
--- a/templates/etc/perp/kylin/rc.env.mustache
+++ /dev/null
@@ -1,4 +0,0 @@
-KYLIN_HOME={{_embedded_root_dir}}/kylin
-# HBase maxsize must be set higher in Kylin 1.5+ to avoid HBase KeyValue size
-# too large errors.
-KYLIN_CUSTOM_START_OPTS=-Dkylin.hbase.client.keyvalue.maxsize=30485760 -DKYLIN_CONF={{etc_dir}}/kylin
diff --git a/templates/etc/perp/kylin/rc.log b/templates/etc/perp/kylin/rc.log
deleted file mode 100755
index 089baa65a..000000000
--- a/templates/etc/perp/kylin/rc.log
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/usr/bin/env bash
-exec ../rc.log "$@"
diff --git a/templates/etc/perp/kylin/rc.main.mustache b/templates/etc/perp/kylin/rc.main.mustache
deleted file mode 100755
index 7e8852f01..000000000
--- a/templates/etc/perp/kylin/rc.main.mustache
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env bash
-
-# Redirect stderr to stdout
-exec 2>&1
-
-if [ "${1}" = "start" ]; then
- echo "starting ${2}..."
- api_umbrella_user="{{user}}"
-
- run_args=("-e" "rc.env")
- if [ -n "$api_umbrella_user" ]; then
- run_args+=("-u" "$api_umbrella_user")
- fi
-
- exec runtool "${run_args[@]}" "{{etc_dir}}/perp/kylin/rc.run"
-fi
-
-exit 0
diff --git a/templates/etc/perp/kylin/rc.run.mustache b/templates/etc/perp/kylin/rc.run.mustache
deleted file mode 100755
index 3979bbeeb..000000000
--- a/templates/etc/perp/kylin/rc.run.mustache
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env bash
-
-# A script to start Kylin, but in the foreground. This is basically a copy of
-# the normal Kylin start script
-# (version 1.2: https://github.com/apache/kylin/blob/kylin-1.2/bin/kylin.sh),
-# but modified so that it doesn't background the kylin process. This provides a
-# perp-compatible way to start Kylin (since it requires foreground processes).
-#
-# We should see about submitting a "run" pull request to Kylin to provide
-# foreground based starting directly in Kylin so we can get rid of this script.
-
-dir="{{_embedded_root_dir}}/kylin/bin"
-source ${dir}/check-env.sh
-mkdir -p ${KYLIN_HOME}/logs
-
-tomcat_root=${dir}/../tomcat
-export tomcat_root
-
-rm -rf ${tomcat_root}/webapps/kylin
-
-spring_profile=`sh ${dir}/get-properties.sh kylin.security.profile`
-if [ -z "$spring_profile" ]
-then
- echo 'please set kylin.security.profile in kylin.properties, options are: testing, ldap, saml.'
- exit 1
-else
- echo "kylin.security.profile is set to $spring_profile"
-fi
-
-source ${dir}/find-hive-dependency.sh
-source ${dir}/find-hbase-dependency.sh
-if [ -f "${dir}/setenv.sh" ]
- then source ${dir}/setenv.sh
-fi
-
-export HBASE_CLASSPATH_PREFIX=${tomcat_root}/bin/bootstrap.jar:${tomcat_root}/bin/tomcat-juli.jar:${tomcat_root}/lib/*:$HBASE_CLASSPATH_PREFIX
-mkdir -p ${KYLIN_HOME}/ext
-export HBASE_CLASSPATH=$hive_dependency:${KYLIN_HOME}/lib/*:${KYLIN_HOME}/ext/*:${HBASE_CLASSPATH}
-
-exec \
- hbase \
- ${KYLIN_EXTRA_START_OPTS} \
- ${KYLIN_CUSTOM_START_OPTS} \
- -Djava.util.logging.config.file=${tomcat_root}/conf/logging.properties \
- -Djava.util.logging.manager=org.apache.juli.ClassLoaderLogManager \
- -Dorg.apache.tomcat.util.buf.UDecoder.ALLOW_ENCODED_SLASH=true \
- -Dorg.apache.catalina.connector.CoyoteAdapter.ALLOW_BACKSLASH=true \
- -Djava.endorsed.dirs=${tomcat_root}/endorsed \
- -Dcatalina.base=${tomcat_root} \
- -Dcatalina.home=${tomcat_root} \
- -Djava.io.tmpdir=${tomcat_root}/temp \
- -Dkylin.hive.dependency=${hive_dependency} \
- -Dkylin.hbase.dependency=${hbase_dependency} \
- -Dspring.profiles.active=${spring_profile} \
- org.apache.hadoop.util.RunJar ${tomcat_root}/bin/bootstrap.jar org.apache.catalina.startup.Bootstrap start
diff --git a/templates/etc/perp/presto/rc.log b/templates/etc/perp/presto/rc.log
deleted file mode 100755
index 089baa65a..000000000
--- a/templates/etc/perp/presto/rc.log
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/usr/bin/env bash
-exec ../rc.log "$@"
diff --git a/templates/etc/perp/presto/rc.main.mustache b/templates/etc/perp/presto/rc.main.mustache
deleted file mode 100755
index 7f8bf8512..000000000
--- a/templates/etc/perp/presto/rc.main.mustache
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env bash
-
-# Redirect stderr to stdout
-exec 2>&1
-
-if [ "${1}" = "start" ]; then
- echo "starting ${2}..."
- api_umbrella_user="{{user}}"
-
- run_args=()
- if [ -n "$api_umbrella_user" ]; then
- run_args+=("-u" "$api_umbrella_user")
- fi
-
- exec runtool "${run_args[@]}" "{{_embedded_root_dir}}/presto/bin/launcher" run \
- --node-config="{{etc_dir}}/presto/node.properties" \
- --jvm-config="{{etc_dir}}/presto/jvm.config" \
- --config="{{etc_dir}}/presto/config.properties" \
- --log-levels-file="{{etc_dir}}/presto/log.properties"
-fi
-
-exit 0
diff --git a/templates/etc/presto/catalog/hive.properties.mustache b/templates/etc/presto/catalog/hive.properties.mustache
deleted file mode 100644
index 25a3291d7..000000000
--- a/templates/etc/presto/catalog/hive.properties.mustache
+++ /dev/null
@@ -1,2 +0,0 @@
-connector.name=hive-hadoop2
-hive.metastore.uri={{presto.embedded_server_config.hive.metastore_uri}}
diff --git a/templates/etc/presto/config.properties.mustache b/templates/etc/presto/config.properties.mustache
deleted file mode 100644
index c08431cf4..000000000
--- a/templates/etc/presto/config.properties.mustache
+++ /dev/null
@@ -1,12 +0,0 @@
-coordinator=true
-node-scheduler.include-coordinator=true
-http-server.http.port={{presto.embedded_server_config.http_port}}
-http-server.log.path={{log_dir}}/presto/http-request.log
-query.max-memory={{presto.embedded_server_config.max_memory}}
-query.max-memory-per-node={{presto.embedded_server_config.max_memory_per_node}}
-discovery-server.enabled=true
-discovery.uri={{presto.embedded_server_config.discovery_uri}}
-catalog.config-dir={{etc_dir}}/presto/catalog
-
-# Limit how long individual queries can run.
-query.max-run-time={{presto.embedded_server_config.query_max_run_time}}
diff --git a/templates/etc/presto/jvm.config.mustache b/templates/etc/presto/jvm.config.mustache
deleted file mode 100644
index ac29e837b..000000000
--- a/templates/etc/presto/jvm.config.mustache
+++ /dev/null
@@ -1,8 +0,0 @@
--server
--Xmx2G
--XX:+UseG1GC
--XX:G1HeapRegionSize=32M
--XX:+UseGCOverheadLimit
--XX:+ExplicitGCInvokesConcurrent
--XX:+HeapDumpOnOutOfMemoryError
--XX:OnOutOfMemoryError=kill -9 %p
diff --git a/templates/etc/presto/log.properties.mustache b/templates/etc/presto/log.properties.mustache
deleted file mode 100644
index 1c52627f6..000000000
--- a/templates/etc/presto/log.properties.mustache
+++ /dev/null
@@ -1 +0,0 @@
-com.facebook.presto=INFO
diff --git a/templates/etc/presto/node.properties.mustache b/templates/etc/presto/node.properties.mustache
deleted file mode 100644
index f82801ed1..000000000
--- a/templates/etc/presto/node.properties.mustache
+++ /dev/null
@@ -1,3 +0,0 @@
-node.environment=production
-node.id=ffffffff-ffff-ffff-ffff-ffffffffffff
-node.data-dir={{db_dir}}/presto
diff --git a/templates/etc/rsyslog.conf.mustache b/templates/etc/rsyslog.conf.mustache
index 03baa2faf..62c267e5d 100644
--- a/templates/etc/rsyslog.conf.mustache
+++ b/templates/etc/rsyslog.conf.mustache
@@ -226,181 +226,6 @@ local0.info action(
)
{{/analytics._output_elasticsearch?}}
-{{#analytics._output_kylin?}}
-module(load="omkafka")
-
-# Define the SQL-based output.
-if($!raw!denied_reason != "") then {
- set $!usr!sql!denied_reason = $!raw!denied_reason;
-}
-if($!raw!id != "") then {
- set $!usr!sql!id = $!raw!id;
-}
-if($!raw!request_accept != "") then {
- set $!usr!sql!request_accept = $!raw!request_accept;
-}
-if($!raw!request_accept_encoding != "") then {
- set $!usr!sql!request_accept_encoding = $!raw!request_accept_encoding;
-}
-if($!raw!request_basic_auth_username != "") then {
- set $!usr!sql!request_basic_auth_username = $!raw!request_basic_auth_username;
-}
-if($!raw!request_connection != "") then {
- set $!usr!sql!request_connection = $!raw!request_connection;
-}
-if($!raw!request_content_type != "") then {
- set $!usr!sql!request_content_type = $!raw!request_content_type;
-}
-if($!raw!request_ip != "") then {
- set $!usr!sql!request_ip = $!raw!request_ip;
-}
-if($!raw!request_ip_city != "") then {
- set $!usr!sql!request_ip_city = $!raw!request_ip_city;
-}
-if($!raw!request_ip_country != "") then {
- set $!usr!sql!request_ip_country = $!raw!request_ip_country;
-}
-if($!raw!request_ip_lat != "") then {
- set $!usr!sql!request_ip_lat = $!raw!request_ip_lat;
-}
-if($!raw!request_ip_lon != "") then {
- set $!usr!sql!request_ip_lon = $!raw!request_ip_lon;
-}
-if($!raw!request_ip_region != "") then {
- set $!usr!sql!request_ip_region = $!raw!request_ip_region;
-}
-if($!raw!request_method != "") then {
- set $!usr!sql!request_method = $!raw!request_method;
-}
-if($!raw!request_origin != "") then {
- set $!usr!sql!request_origin = $!raw!request_origin;
-}
-if($!raw!request_referer != "") then {
- set $!usr!sql!request_referer = $!raw!request_referer;
-}
-if($!raw!request_size != "") then {
- set $!usr!sql!request_size = $!raw!request_size;
-}
-if($!raw!request_url_host != "") then {
- set $!usr!sql!request_url_host = $!raw!request_url_host;
-}
-if($!raw!request_url_path != "") then {
- set $!usr!sql!request_url_path = $!raw!request_url_path;
-}
-if($!raw!request_url_path_level1 != "") then {
- set $!usr!sql!request_url_path_level1 = $!raw!request_url_path_level1;
-}
-if($!raw!request_url_path_level2 != "") then {
- set $!usr!sql!request_url_path_level2 = $!raw!request_url_path_level2;
-}
-if($!raw!request_url_path_level3 != "") then {
- set $!usr!sql!request_url_path_level3 = $!raw!request_url_path_level3;
-}
-if($!raw!request_url_path_level4 != "") then {
- set $!usr!sql!request_url_path_level4 = $!raw!request_url_path_level4;
-}
-if($!raw!request_url_path_level5 != "") then {
- set $!usr!sql!request_url_path_level5 = $!raw!request_url_path_level5;
-}
-if($!raw!request_url_path_level6 != "") then {
- set $!usr!sql!request_url_path_level6 = $!raw!request_url_path_level6;
-}
-if($!raw!request_url_query != "") then {
- set $!usr!sql!request_url_query = $!raw!request_url_query;
-}
-if($!raw!request_url_scheme != "") then {
- set $!usr!sql!request_url_scheme = $!raw!request_url_scheme;
-}
-if($!raw!request_user_agent != "") then {
- set $!usr!sql!request_user_agent = $!raw!request_user_agent;
-}
-if($!raw!request_user_agent_family != "") then {
- set $!usr!sql!request_user_agent_family = $!raw!request_user_agent_family;
-}
-if($!raw!request_user_agent_type != "") then {
- set $!usr!sql!request_user_agent_type = $!raw!request_user_agent_type;
-}
-if($!raw!response_age != "") then {
- set $!usr!sql!response_age = $!raw!response_age;
-}
-if($!raw!response_cache != "") then {
- set $!usr!sql!response_cache = $!raw!response_cache;
-}
-if($!raw!response_content_encoding != "") then {
- set $!usr!sql!response_content_encoding = $!raw!response_content_encoding;
-}
-if($!raw!response_content_length != "") then {
- set $!usr!sql!response_content_length = $!raw!response_content_length;
-}
-if($!raw!response_content_type != "") then {
- set $!usr!sql!response_content_type = $!raw!response_content_type;
-}
-if($!raw!response_server != "") then {
- set $!usr!sql!response_server = $!raw!response_server;
-}
-if($!raw!response_size != "") then {
- set $!usr!sql!response_size = $!raw!response_size;
-}
-if($!raw!response_status != "") then {
- set $!usr!sql!response_status = $!raw!response_status;
-}
-if($!raw!response_transfer_encoding != "") then {
- set $!usr!sql!response_transfer_encoding = $!raw!response_transfer_encoding;
-}
-if($!raw!timer_response != "") then {
- set $!usr!sql!timer_response = $!raw!timer_response;
-}
-if($!raw!timestamp_tz_date != "") then {
- set $!usr!sql!timestamp_tz_date = $!raw!timestamp_tz_date;
-}
-if($!raw!timestamp_tz_hour != "") then {
- set $!usr!sql!timestamp_tz_hour = $!raw!timestamp_tz_hour;
-}
-if($!raw!timestamp_tz_minute != "") then {
- set $!usr!sql!timestamp_tz_minute = $!raw!timestamp_tz_minute;
-}
-if($!raw!timestamp_tz_month != "") then {
- set $!usr!sql!timestamp_tz_month = $!raw!timestamp_tz_month;
-}
-if($!raw!timestamp_tz_offset != "") then {
- set $!usr!sql!timestamp_tz_offset = $!raw!timestamp_tz_offset;
-}
-if($!raw!timestamp_tz_week != "") then {
- set $!usr!sql!timestamp_tz_week = $!raw!timestamp_tz_week;
-}
-if($!raw!timestamp_tz_year != "") then {
- set $!usr!sql!timestamp_tz_year = $!raw!timestamp_tz_year;
-}
-if($!raw!timestamp_utc != "") then {
- set $!usr!sql!timestamp_utc = $!raw!timestamp_utc;
-}
-if($!raw!user_id != "") then {
- set $!usr!sql!user_id = $!raw!user_id;
-}
-
-# Define templates for SQL output via JSON records.
-template(name="sql-json-record" type="subtree" subtree="$!usr!sql")
-
-# Output to Kafka.
-# A disk-assisted memory queue is used for buffering.
-local0.info action(
- name="output-kafka"
- type="omkafka"
- broker=[{{kafka._rsyslog_broker}}]
- topic="{{kafka.topic}}"
- confParam=["compression.codec=gzip"]
- template="sql-json-record"
- queue.type="LinkedList"
- queue.filename="queue-kafka"
- queue.checkpointinterval="10"
- queue.saveonshutdown="on"
- queue.size="15000"
- queue.highwatermark="10000"
- queue.lowwatermark="2000"
- action.resumeRetryCount="-1"
-)
-{{/analytics._output_kylin?}}
-
# Output to local file for redundancy until we prove the new system out.
template(name="all-json-record" type="list") {
property(name="$!raw") constant(value="\n")
diff --git a/test/admin_ui/test_legacy_redirects.rb b/test/admin_ui/test_legacy_redirects.rb
index d47799b3d..ee1b019e5 100644
--- a/test/admin_ui/test_legacy_redirects.rb
+++ b/test/admin_ui/test_legacy_redirects.rb
@@ -16,7 +16,7 @@ def setup
def test_drilldown
admin_login
- visit "/admin/#/stats/drilldown/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour&beta_analytics=false®ion=US"
+ visit "/admin/#/stats/drilldown/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour®ion=US"
assert_link("Download CSV", :href => /start_at=2015-01-15/)
assert_current_admin_url("/stats/drilldown", {
@@ -29,7 +29,7 @@ def test_drilldown
def test_logs
admin_login
- visit "/admin/#/stats/logs/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour&beta_analytics=false®ion=US"
+ visit "/admin/#/stats/logs/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour®ion=US"
assert_link("Download CSV", :href => /start_at=2015-01-15/)
assert_current_admin_url("/stats/logs", {
@@ -42,7 +42,7 @@ def test_logs
def test_users
admin_login
- visit "/admin/#/stats/users/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour&beta_analytics=false®ion=US"
+ visit "/admin/#/stats/users/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour®ion=US"
assert_link("Download CSV", :href => /start_at=2015-01-15/)
assert_current_admin_url("/stats/users", {
@@ -54,7 +54,7 @@ def test_users
def test_map
admin_login
- visit "/admin/#/stats/map/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour&beta_analytics=false®ion=US"
+ visit "/admin/#/stats/map/tz=America%2FDenver&search=&start_at=2015-01-15&end_at=2015-01-18&query=%7B%22condition%22%3A%22AND%22%2C%22rules%22%3A%5B%7B%22id%22%3A%22gatekeeper_denied_code%22%2C%22field%22%3A%22gatekeeper_denied_code%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22select%22%2C%22operator%22%3A%22is_null%22%2C%22value%22%3Anull%7D%2C%7B%22id%22%3A%22request_host%22%2C%22field%22%3A%22request_host%22%2C%22type%22%3A%22string%22%2C%22input%22%3A%22text%22%2C%22operator%22%3A%22begins_with%22%2C%22value%22%3A%22example.com%22%7D%5D%7D&interval=hour®ion=US"
assert_link("Download CSV", :href => /start_at=2015-01-15/)
assert_current_admin_url("/stats/map", {
diff --git a/test/admin_ui/test_stats_drilldown.rb b/test/admin_ui/test_stats_drilldown.rb
index 5a5fc3ac3..8e0feeecd 100644
--- a/test/admin_ui/test_stats_drilldown.rb
+++ b/test/admin_ui/test_stats_drilldown.rb
@@ -43,7 +43,6 @@ def test_csv_download
"search" => "",
"query" => default_query,
"prefix" => "0/",
- "beta_analytics" => "false",
}, uri.query_values.except("api_key"))
assert_equal(40, uri.query_values["api_key"].length)
diff --git a/test/admin_ui/test_stats_logs.rb b/test/admin_ui/test_stats_logs.rb
index ea4cca855..21a79cdaa 100644
--- a/test/admin_ui/test_stats_logs.rb
+++ b/test/admin_ui/test_stats_logs.rb
@@ -59,7 +59,6 @@ def test_csv_download_link_changes_with_filters
"end_at" => "2015-01-18",
"interval" => "day",
"query" => default_query,
- "beta_analytics" => "false",
}, uri.query_values)
visit "/admin/#/stats/logs?search=&start_at=2015-01-13&end_at=2015-01-18&interval=day"
@@ -74,7 +73,6 @@ def test_csv_download_link_changes_with_filters
"end_at" => "2015-01-18",
"interval" => "day",
"query" => default_query,
- "beta_analytics" => "false",
}, uri.query_values)
visit "/admin/#/stats/logs?search=&start_at=2015-01-12&end_at=2015-01-18&interval=day"
@@ -94,7 +92,6 @@ def test_csv_download_link_changes_with_filters
"interval" => "day",
"query" => JSON.generate({ "condition" => "AND", "rules" => [], "valid" => true }),
"search" => "",
- "beta_analytics" => "false",
}, uri.query_values)
visit "/admin/#/stats/logs?search=&start_at=2015-01-13&end_at=2015-01-18&interval=day"
@@ -114,7 +111,6 @@ def test_csv_download_link_changes_with_filters
"end_at" => "2015-01-18",
"interval" => "day",
"query" => "",
- "beta_analytics" => "false",
}, uri.query_values)
end
@@ -173,22 +169,6 @@ def test_changing_intervals
assert_link("Download CSV", :href => /interval=minute/)
end
- def test_does_not_show_beta_analytics_toggle_by_default
- admin_login
- visit "/admin/#/stats/logs?search=&start_at=2015-01-12&end_at=2015-01-18&interval=day"
- assert_text("view top users")
- refute_text("Beta Analytics")
- end
-
- def test_shows_beta_analytics_toggle_when_enabled
- override_config({ "analytics" => { "outputs" => ["kylin"] } }, nil) do
- admin_login
- visit "/admin/#/stats/logs?search=&start_at=2015-01-12&end_at=2015-01-18&interval=day"
- assert_text("view top users")
- assert_text("Beta Analytics")
- end
- end
-
def test_date_range_picker
assert_date_range_picker("/stats/logs")
end
diff --git a/test/admin_ui/test_stats_map.rb b/test/admin_ui/test_stats_map.rb
index c3f4a4b3a..667513bc2 100644
--- a/test/admin_ui/test_stats_map.rb
+++ b/test/admin_ui/test_stats_map.rb
@@ -42,7 +42,6 @@ def test_csv_download
"search" => "",
"query" => default_query,
"region" => "world",
- "beta_analytics" => "false",
}, uri.query_values)
# Wait for the ajax actions to fetch the graph and tables to both
diff --git a/test/admin_ui/test_stats_users.rb b/test/admin_ui/test_stats_users.rb
index be2889ca5..a57855b12 100644
--- a/test/admin_ui/test_stats_users.rb
+++ b/test/admin_ui/test_stats_users.rb
@@ -70,7 +70,6 @@ def test_csv_download
"end_at" => "2015-01-18",
"search" => "",
"query" => default_query,
- "beta_analytics" => "false",
}, uri.query_values)
# Wait for the ajax actions to fetch the graph and tables to both
diff --git a/test/apis/admin/test_auth.rb b/test/apis/admin/test_auth.rb
index c77a6f8a8..527a3f532 100644
--- a/test/apis/admin/test_auth.rb
+++ b/test/apis/admin/test_auth.rb
@@ -36,7 +36,6 @@ def test_authenticated
"api_key",
"api_umbrella_version",
"authenticated",
- "enable_beta_analytics",
"local_auth_enabled",
"password_length_min",
"username_is_email",
@@ -48,7 +47,6 @@ def test_authenticated
assert_kind_of(String, data["api_key"])
assert_kind_of(String, data["api_umbrella_version"])
assert_includes([TrueClass, FalseClass], data["authenticated"].class)
- assert_includes([TrueClass, FalseClass], data["enable_beta_analytics"].class)
assert_equal([
"email",