Commit 335ee79a authored by Tiago Botelho

Refactors median code to work with both single and multiple projects

parent 3f31da9c
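At a high level, the refactor leaves Gitlab::Database::Median with two entry points instead of one: median_datetime for a single project and median_datetimes for a set of projects partitioned by a column. A hedged sketch of the intended call shapes (return values are invented for illustration; cte_table and interval_query are built by the stage code in the diff below):

    # Sketch only, not part of the commit itself.
    median_datetime(cte_table, interval_query, :issue)
    # => 5400.0  (median duration in seconds for the one project, or nil without data)

    median_datetimes(cte_table, interval_query, :issue, :project_id)
    # => { 1 => 5400.0, 2 => 86400.0 }  (median keyed by project_id; {} on MySQL)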
@@ -7,8 +7,8 @@ class CycleAnalytics
   end
 
   def all_medians_per_stage
-    STAGES.each_with_object({}) do |stage_name, hsh|
-      hsh[stage_name] = self[stage_name].median
+    STAGES.each_with_object({}) do |stage_name, medians_per_stage|
+      medians_per_stage[stage_name] = self[stage_name].median
     end
   end
...
@@ -7,6 +7,8 @@ class AnalyticsStageEntity < Grape::Entity
   expose :description
 
   expose :median, as: :value do |stage|
-    distance_of_time_in_words(stage.median) if stage.median && !(stage.median.blank? || stage.median.zero?)
+    if stage.median && !(stage.median.nil? || stage.median.zero?)
+      distance_of_time_in_words(stage.median)
+    end
   end
 end
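For context, stage.median is a duration in seconds, so the guard above only skips rendering when there is no usable value. A hedged example of what the helper produces, assuming ActionView's distance_of_time_in_words semantics:

    # Illustrative only: seconds in, human-readable text out.
    distance_of_time_in_words(300)   # => "5 minutes"
    distance_of_time_in_words(7200)  # => "about 2 hours"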
@@ -8,14 +8,14 @@ module Gitlab
       private
 
       def base_query
-        @base_query ||= stage_query([@project.id]) # rubocop:disable Gitlab/ModuleWithInstanceVariables
+        @base_query ||= stage_query(@project.id) # rubocop:disable Gitlab/ModuleWithInstanceVariables
       end
 
       def stage_query(project_ids)
         query = mr_closing_issues_table.join(issue_table).on(issue_table[:id].eq(mr_closing_issues_table[:issue_id]))
           .join(issue_metrics_table).on(issue_table[:id].eq(issue_metrics_table[:issue_id]))
           .project(issue_table[:project_id].as("project_id"))
-          .where(issue_table[:project_id].in(project_ids))
+          .where(issue_table[:project_id].in(Array(project_ids)))
           .where(issue_table[:created_at].gteq(@options[:from])) # rubocop:disable Gitlab/ModuleWithInstanceVariables
 
         # Load merge_requests
...
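The Array() wrapping is what lets base_query now pass a bare id while multi-project callers keep passing a list; a quick sketch of the Kernel#Array behaviour being relied on:

    Array(42)        # => [42]
    Array([1, 2, 3]) # => [1, 2, 3]
    Array(nil)       # => []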
@@ -31,11 +31,15 @@ module Gitlab
         interval_query = Arel::Nodes::As.new(cte_table,
           subtract_datetimes(stage_query(project_ids), start_time_attrs, end_time_attrs, name.to_s))
 
+        if project_ids.size == 1
+          loader.call(@project.id, median_datetime(cte_table, interval_query, name))
+        else
           median_datetimes(cte_table, interval_query, name, :project_id)&.each do |project_id, median|
             loader.call(project_id, median)
           end
+        end
       end
 
       def name
         raise NotImplementedError.new("Expected #{self.name} to implement name")
...
@@ -5,26 +5,20 @@ module Gitlab
       attr_reader :projects, :options
 
-      def initialize(projects, options)
-        @projects = projects
-        @options = options
+      def initialize
+        @projects = Project.sorted_by_activity.limit(PROJECTS_LIMIT)
+        @options = { from: 7.days.ago }
       end
 
       def to_json
         total = 0
-        values = {}
-
-        medians_per_stage.each do |stage_name, medians|
-          medians = medians.map(&:presence).compact
 
-          stage_values = {
-            average: calc_average(medians),
-            sd: standard_deviation(medians),
-            missing: projects.length - medians.length
-          }
+        values =
+          medians_per_stage.each_with_object({}) do |(stage_name, medians), hsh|
+            calculations = stage_values(medians)
 
-          total += stage_values.values.compact.sum
-          values[stage_name] = stage_values
-        end
+            total += calculations.values.compact.sum
+            hsh[stage_name] = calculations
+          end
 
         values[:total] = total
@@ -43,26 +37,36 @@ module Gitlab
         end
       end
 
+      def stage_values(medians)
+        medians = medians.map(&:presence).compact
+        average = calc_average(medians)
+
+        {
+          average: average,
+          sd: standard_deviation(medians, average),
+          missing: projects.length - medians.length
+        }
+      end
+
       def calc_average(values)
         return if values.empty?
 
         (values.sum / values.length).to_i
       end
 
-      def sample_variance(values)
+      def standard_deviation(values, average)
+        Math.sqrt(sample_variance(values, average)).to_i
+      end
+
+      def sample_variance(values, average)
         return 0 if values.length <= 1
 
-        avg = calc_average(values)
-
         sum = values.inject(0) do |acc, val|
-          acc + (val - avg)**2
+          acc + (val - average)**2
         end
 
         sum / (values.length - 1)
       end
-
-      def standard_deviation(values)
-        Math.sqrt(sample_variance(values)).to_i
-      end
     end
   end
 end
@@ -2,32 +2,34 @@
 module Gitlab
   module Database
     module Median
-      def median_datetimes(arel_table, query_so_far, column_sym, partition_column)
-        median_queries =
-          if Gitlab::Database.postgresql?
-            pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column)
-          elsif Gitlab::Database.mysql?
-            mysql_median_datetime_sql(arel_table, query_so_far, column_sym)
-          end
+      def median_datetime(arel_table, query_so_far, column_sym)
+        extract_median(execute_queries(arel_table, query_so_far, column_sym)).presence
+      end
 
-        results = Array.wrap(median_queries).map do |query|
-          ActiveRecord::Base.connection.execute(query)
-        end
-
-        extract_medians(results).presence
+      def median_datetimes(arel_table, query_so_far, column_sym, partition_column)
+        extract_medians(execute_queries(arel_table, query_so_far, column_sym, partition_column)).presence
       end
 
-      def extract_medians(results)
+      def extract_median(results)
         result = results.compact.first
 
         if Gitlab::Database.postgresql?
-          result.values.map do |id, median|
-            [id.to_i, median&.to_f]
-          end.to_h
+          result = result.first.presence
+
+          result['median']&.to_f if result
         elsif Gitlab::Database.mysql?
           result.to_a.flatten.first
         end
       end
 
+      def extract_medians(results)
+        return {} if Gitlab::Database.mysql?
+
+        results.compact.first.values.map do |id, median|
+          [id.to_i, median&.to_f]
+        end.to_h
+      end
+
       def mysql_median_datetime_sql(arel_table, query_so_far, column_sym)
         query = arel_table
           .from(arel_table.project(Arel.sql('*')).order(arel_table[column_sym]).as(arel_table.table_name))
@@ -53,7 +55,7 @@ module Gitlab
         ]
       end
 
-      def pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column)
+      def pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column = nil)
         # Create a CTE with the column we're operating on, row number (after sorting by the column
         # we're operating on), and count of the table we're operating on (duplicated across) all rows
         # of the CTE. For example, if we're looking to find the median of the `projects.star_count`
@@ -64,28 +66,31 @@ module Gitlab
         #    5 |      1 |  3
         #    9 |      2 |  3
         #   15 |      3 |  3
+        #
+        # If a partition column is used we will do the same operation but for separate partitions,
+        # when that happens the CTE might look like this:
+        #
+        #  project_id | star_count | row_id | ct
+        # ------------+------------+--------+----
+        #           1 |          5 |      1 |  2
+        #           1 |          9 |      2 |  2
+        #           2 |         10 |      1 |  3
+        #           2 |         15 |      2 |  3
+        #           2 |         20 |      3 |  3
         cte_table = Arel::Table.new("ordered_records")
         cte = Arel::Nodes::As.new(
           cte_table,
-          arel_table
-            .project(
-              arel_table[partition_column],
-              arel_table[column_sym].as(column_sym.to_s),
-              Arel::Nodes::Over.new(Arel::Nodes::NamedFunction.new("rank", []),
-                                    Arel::Nodes::Window.new.partition(arel_table[partition_column])
-                                      .order(arel_table[column_sym])).as('row_id'),
-              arel_table.from(arel_table.alias)
-                .project("COUNT(*)")
-                .where(arel_table[partition_column].eq(arel_table.alias[partition_column])).as('ct')).
+          arel_table.project(*rank_rows(arel_table, column_sym, partition_column)).
             # Disallow negative values
             where(arel_table[column_sym].gteq(zero_interval)))
 
         # From the CTE, select either the middle row or the middle two rows (this is accomplished
         # by 'where cte.row_id between cte.ct / 2.0 AND cte.ct / 2.0 + 1'). Find the average of the
         # selected rows, and this is the median value.
+        result =
           cte_table
-            .project(cte_table[partition_column])
-            .project(average([extract_epoch(cte_table[column_sym])], "median"))
+            .project(*median_projections(cte_table, column_sym, partition_column))
             .where(
               Arel::Nodes::Between.new(
                 cte_table[:row_id],
@@ -96,17 +101,72 @@ module Gitlab
               )
             )
             .with(query_so_far, cte)
-            .group(cte_table[partition_column])
-            .order(cte_table[partition_column])
-            .to_sql
+
+        result.group(cte_table[partition_column]).order(cte_table[partition_column]) if partition_column
+
+        result.to_sql
       end
 
       private
 
+      def median_queries(arel_table, query_so_far, column_sym, partition_column = nil)
+        if Gitlab::Database.postgresql?
+          pg_median_datetime_sql(arel_table, query_so_far, column_sym, partition_column)
+        elsif Gitlab::Database.mysql?
+          mysql_median_datetime_sql(arel_table, query_so_far, column_sym)
+        end
+      end
+
+      def execute_queries(arel_table, query_so_far, column_sym, partition_column = nil)
+        queries = median_queries(arel_table, query_so_far, column_sym, partition_column)
+
+        Array.wrap(queries).map { |query| ActiveRecord::Base.connection.execute(query) }
+      end
+
       def average(args, as)
         Arel::Nodes::NamedFunction.new("AVG", args, as)
       end
 
+      def rank_rows(arel_table, column_sym, partition_column)
+        column_row = arel_table[column_sym].as(column_sym.to_s)
+
+        if partition_column
+          partition_row = arel_table[partition_column]
+          row_id =
+            Arel::Nodes::Over.new(
+              Arel::Nodes::NamedFunction.new('rank', []),
+              Arel::Nodes::Window.new.partition(arel_table[partition_column])
+                .order(arel_table[column_sym])
+            ).as('row_id')
+          count = arel_table.from(arel_table.alias)
+            .project('COUNT(*)')
+            .where(arel_table[partition_column].eq(arel_table.alias[partition_column]))
+            .as('ct')
+
+          [partition_row, column_row, row_id, count]
+        else
+          row_id =
+            Arel::Nodes::Over.new(
+              Arel::Nodes::NamedFunction.new('row_number', []),
+              Arel::Nodes::Window.new.order(arel_table[column_sym])
+            ).as('row_id')
+          count = arel_table.project("COUNT(1)").as('ct')
+
+          [column_row, row_id, count]
+        end
+      end
+
+      def median_projections(table, column_sym, partition_column)
+        if partition_column
+          [table[partition_column],
+           average([extract_epoch(table[column_sym])], "median")]
+        else
+          [average([extract_epoch(table[column_sym])], "median")]
+        end
+      end
+
       def extract_epoch(arel_attribute)
         Arel.sql(%Q{EXTRACT(EPOCH FROM "#{arel_attribute.relation.name}"."#{arel_attribute.name}")})
       end
...
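The row_id BETWEEN ct / 2.0 AND ct / 2.0 + 1 condition above is what makes AVG return the median: it selects the middle row when the count is odd and the two middle rows when it is even. A small Ruby sketch of the same selection, using the star counts from the comment (illustrative only):

    sorted = [5, 9, 15, 20]
    ct = sorted.length                                                              # => 4
    middle = sorted.each_with_index.select { |_, i| (i + 1).between?(ct / 2.0, ct / 2.0 + 1) }
    middle.map(&:first)                                                             # => [9, 15]
    middle.sum { |value, _| value } / middle.size.to_f                              # => 12.0, the median
    # With an odd count, e.g. [5, 9, 15], only row_id 2 falls in 1.5..2.5, so the median is 9.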
@@ -73,12 +73,7 @@ module Gitlab
     end
 
     def cycle_analytics_usage_data
-      # We only want to generate this data for instances that use PostgreSQL
-      return {} if Gitlab::Database.mysql?
-
-      projects = Project.sorted_by_activity.limit(Gitlab::CycleAnalytics::UsageData::PROJECTS_LIMIT)
-
-      Gitlab::CycleAnalytics::UsageData.new(projects, { from: 7.days.ago }).to_json
+      Gitlab::CycleAnalytics::UsageData.new.to_json
     end
 
     def features_usage_data
...
@@ -41,7 +41,7 @@ feature 'Cycle Analytics', :js do
       allow_any_instance_of(Gitlab::ReferenceExtractor).to receive(:issues).and_return([issue])
       project.add_master(user)
 
-      create_cycle
+      @build = create_cycle(user, project, issue, mr, milestone, pipeline)
       deploy_master
 
       sign_in(user)
@@ -117,7 +117,7 @@ feature 'Cycle Analytics', :js do
       project.add_guest(guest)
 
       allow_any_instance_of(Gitlab::ReferenceExtractor).to receive(:issues).and_return([issue])
-      create_cycle
+      create_cycle(user, project, issue, mr, milestone, pipeline)
       deploy_master
 
       sign_in(guest)
@@ -166,16 +166,6 @@ feature 'Cycle Analytics', :js do
     expect(find('.stage-events')).to have_content("!#{mr.iid}")
   end
 
-  def create_cycle
-    issue.update(milestone: milestone)
-    pipeline.run
-
-    @build = create(:ci_build, pipeline: pipeline, status: :success, author: user)
-
-    merge_merge_requests_closing_issue(issue)
-    ProcessCommitWorker.new.perform(project.id, user.id, mr.commits.last.to_hash)
-  end
-
   def click_stage(stage_name)
     find('.stage-nav li', text: stage_name).click
     wait_for_requests
...
require 'spec_helper'
describe Gitlab::CycleAnalytics::UsageData do
let(:project) { create(:project, :repository) }
let(:user) { create(:user, :admin) }
let(:issue) { create(:issue, project: project, created_at: 2.days.ago) }
let(:milestone) { create(:milestone, project: project) }
let(:mr) { create_merge_request_closing_issue(issue, commit_message: "References #{issue.to_reference}") }
let(:pipeline) { create(:ci_empty_pipeline, status: 'created', project: project, ref: mr.source_branch, sha: mr.source_branch_sha, head_pipeline_of: mr) }
subject { described_class.new }
describe '#to_json' do
before do
allow_any_instance_of(Gitlab::ReferenceExtractor).to receive(:issues).and_return([issue])
create_cycle(user, project, issue, mr, milestone, pipeline)
deploy_master
end
it 'returns the aggregated usage data of every selected project' do
result = subject.to_json
avg_cycle_analytics = result[:avg_cycle_analytics]
expect(result).to have_key(:avg_cycle_analytics)
CycleAnalytics::STAGES.each do |stage_name|
stage_values = avg_cycle_analytics[stage_name]
expect(avg_cycle_analytics).to have_key(stage_name)
expect(stage_values).to have_key(:average)
expect(stage_values).to have_key(:sd)
expect(stage_values).to have_key(:missing)
end
end
end
end
require 'spec_helper'
describe Gitlab::Database::Median do
  # Median is a module, so exercise it through a minimal including class.
  subject(:median) { Class.new { include Gitlab::Database::Median }.new }

  describe '#extract_medians' do
    context 'when using MySQL' do
      it 'returns an empty hash' do
        values = [["1", "1000"]]
        allow(Gitlab::Database).to receive(:mysql?).and_return(true)

        expect(median.extract_medians(values)).to eq({})
      end
    end
  end
end
require 'spec_helper'
describe CycleAnalytics do
let(:project) { create(:project, :repository) }
let(:from_date) { 10.days.ago }
let(:user) { create(:user, :admin) }
let(:issue) { create(:issue, project: project, created_at: 2.days.ago) }
let(:milestone) { create(:milestone, project: project) }
let(:mr) { create_merge_request_closing_issue(issue, commit_message: "References #{issue.to_reference}") }
let(:pipeline) { create(:ci_empty_pipeline, status: 'created', project: project, ref: mr.source_branch, sha: mr.source_branch_sha, head_pipeline_of: mr) }
subject { described_class.new(project, from: from_date) }
describe '#all_medians_per_stage' do
before do
allow_any_instance_of(Gitlab::ReferenceExtractor).to receive(:issues).and_return([issue])
create_cycle(user, project, issue, mr, milestone, pipeline)
deploy_master
end
it 'returns every median for each stage for a specific project' do
values = described_class::STAGES.each_with_object({}) do |stage_name, hsh|
hsh[stage_name] = subject[stage_name].median.presence
end
expect(subject.all_medians_per_stage).to eq(values)
end
end
end
@@ -26,6 +26,18 @@ module CycleAnalyticsHelpers
       ref: 'refs/heads/master').execute
   end
 
+  def create_cycle(user, project, issue, mr, milestone, pipeline)
+    issue.update(milestone: milestone)
+    pipeline.run
+
+    ci_build = create(:ci_build, pipeline: pipeline, status: :success, author: user)
+
+    merge_merge_requests_closing_issue(issue)
+    ProcessCommitWorker.new.perform(project.id, user.id, mr.commits.last.to_hash)
+
+    ci_build
+  end
+
   def create_merge_request_closing_issue(issue, message: nil, source_branch: nil, commit_message: 'commit message')
     if !source_branch || project.repository.commit(source_branch).blank?
       source_branch = generate(:branch)
...
@@ -50,7 +50,7 @@ module CycleAnalyticsHelpers
         end
 
         median_time_difference = time_differences.sort[2]
-        expect(subject[phase].median.presence).to be_within(5).of(median_time_difference)
+        expect(subject[phase].median).to be_within(5).of(median_time_difference)
       end
 
       context "when the data belongs to another project" do
...