Commit a609fdea authored by Adam Hegyi's avatar Adam Hegyi

Sort results by similarity in group project API

This change introduces similarity search based on the incoming search
parameter for the group projects api endpoint.
parent 59c288db
......@@ -451,6 +451,16 @@ class Project < ApplicationRecord
# Sometimes queries (e.g. using CTEs) require explicit disambiguation with table name
scope :projects_order_id_desc, -> { reorder(self.arel_table['id'].desc) }
scope :sorted_by_similarity_desc, -> (search) do
order_expression = Gitlab::Database::SimilarityScore.build_expression(search: search, rules: [
{ column: arel_table["path"], multiplier: 1 },
{ column: arel_table["name"], multiplier: 0.7 },
{ column: arel_table["description"], multiplier: 0.2 }
])
reorder(order_expression.desc, arel_table['id'].desc)
end
scope :with_packages, -> { joins(:packages) }
scope :in_namespace, ->(namespace_ids) { where(namespace_id: namespace_ids) }
scope :personal, ->(user) { where(namespace_id: user.namespace_id) }
......
......@@ -184,7 +184,7 @@ Parameters:
| `id` | integer/string | yes | The ID or [URL-encoded path of the group](README.md#namespaced-path-encoding) owned by the authenticated user |
| `archived` | boolean | no | Limit by archived status |
| `visibility` | string | no | Limit by visibility `public`, `internal`, or `private` |
| `order_by` | string | no | Return projects ordered by `id`, `name`, `path`, `created_at`, `updated_at`, or `last_activity_at` fields. Default is `created_at` |
| `order_by` | string | no | Return projects ordered by `id`, `name`, `path`, `created_at`, `updated_at`, `similarity` (1), or `last_activity_at` fields. Default is `created_at` |
| `sort` | string | no | Return projects sorted in `asc` or `desc` order. Default is `desc` |
| `search` | string | no | Return list of authorized projects matching the search criteria |
| `simple` | boolean | no | Return only the ID, URL, name, and path of each project |
......@@ -198,6 +198,13 @@ Parameters:
| `with_custom_attributes` | boolean | no | Include [custom attributes](custom_attributes.md) in response (admins only) |
| `with_security_reports` | boolean | no | **(ULTIMATE)** Return only projects that have security reports artifacts present in any of their builds. This means "projects with security reports enabled". Default is `false` |
1. Order by similarity: Orders the results by a similarity score calculated from the provided `search`
URL parameter. This is an [alpha](https://about.gitlab.com/handbook/product/gitlab-the-product/#alpha) feature [introduced](https://gitlab.com/gitlab-org/gitlab/-/issues/221043) in GitLab 13.3.
The feature is behind a feature flag, you can [enable it](../administration/feature_flags.md#enable-or-disable-the-feature)
with the `similarity_search` flag. When using `order_by=similarity` the `sort` parameter is
ignored. When the `search` parameter is not provided, the API returns the projects ordered by `name`.
Example response:
```json
......
......@@ -76,7 +76,7 @@ module API
params: project_finder_params,
options: finder_options
).execute
projects = reorder_projects(projects)
projects = reorder_projects_with_similarity_order_support(group, projects)
paginate(projects)
end
......@@ -112,6 +112,24 @@ module API
accepted!
end
def reorder_projects_with_similarity_order_support(group, projects)
return handle_similarity_order(group, projects) if params[:order_by] == 'similarity'
reorder_projects(projects)
end
# rubocop: disable CodeReuse/ActiveRecord
def handle_similarity_order(group, projects)
if params[:search].present? && Feature.enabled?(:similarity_search, group)
projects.sorted_by_similarity_desc(params[:search])
else
order_options = { name: :asc }
order_options['id'] ||= params[:sort] || 'asc'
projects.reorder(order_options)
end
end
# rubocop: enable CodeReuse/ActiveRecord
end
resource :groups do
......@@ -222,7 +240,7 @@ module API
optional :visibility, type: String, values: Gitlab::VisibilityLevel.string_values,
desc: 'Limit by visibility'
optional :search, type: String, desc: 'Return list of authorized projects matching the search criteria'
optional :order_by, type: String, values: %w[id name path created_at updated_at last_activity_at],
optional :order_by, type: String, values: %w[id name path created_at updated_at last_activity_at similarity],
default: 'created_at', desc: 'Return projects ordered by field'
optional :sort, type: String, values: %w[asc desc], default: 'desc',
desc: 'Return projects sorted in ascending and descending order'
......
# frozen_string_literal: true
module Gitlab
module Database
class SimilarityScore
EMPTY_STRING = Arel.sql("''").freeze
EXPRESSION_ON_INVALID_INPUT = Arel::Nodes::NamedFunction.new('CAST', [Arel.sql('0').as('integer')]).freeze
DEFAULT_MULTIPLIER = 1
# This method returns an Arel expression that can be used in an ActiveRecord query to order the resultset by similarity.
#
# Note: Calculating similarity score for large volume of records is inefficient. use SimilarityScore only for smaller
# resultset which is already filtered by other conditions (< 10_000 records).
#
# ==== Parameters
# * +search+ - [String] the user provided search string
# * +rules+ - [{ column: COLUMN, multiplier: 1 }, { column: COLUMN_2, multiplier: 0.5 }] rules for the scoring.
# * +column+ - Arel column expression, example: Project.arel_table["name"]
# * +multiplier+ - Integer or Float to increase or decrease the score (optional, defaults to 1)
#
# ==== Use case
#
# We'd like to search for projects by path, name and description. We want to rank higher the path and name matches, since
# it's more likely that the user was remembering the path or the name of the project.
#
# Rules:
# [
# { column: Project.arel_table['path'], multiplier: 1 },
# { column: Project.arel_table['name'], multiplier: 1 },
# { column: Project.arel_table['description'], multiplier: 0.5 }
# ]
#
# ==== Examples
#
# Similarity calculation based on one column:
#
# Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [{ column: Project.arel_table['name'] }])
#
# Similarity calculation based on two column, where the second column has lower priority:
#
# Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [
# { column: Project.arel_table['name'], multiplier: 1 },
# { column: Project.arel_table['description'], multiplier: 0.5 }
# ])
#
# Integration with an ActiveRecord query:
#
# table = Project.arel_table
#
# order_expression = Gitlab::Database::SimilarityScore.build_expession(search: 'input', rules: [
# { column: table['name'], multiplier: 1 },
# { column: table['description'], multiplier: 0.5 }
# ])
#
# Project.where("name LIKE ?", '%' + 'input' + '%').order(order_expression.desc)
#
# The expression can be also used in SELECT:
#
# results = Project.select(order_expression.as('similarity')).where("name LIKE ?", '%' + 'input' + '%').order(similarity: :desc)
# puts results.map(&:similarity)
#
def self.build_expression(search:, rules:)
return EXPRESSION_ON_INVALID_INPUT if search.blank? || rules.empty?
quoted_search = ActiveRecord::Base.connection.quote(search.to_s)
first_expression, *expressions = rules.map do |rule|
rule_to_arel(quoted_search, rule)
end
# (SIMILARITY ...) + (SIMILARITY ...)
expressions.inject(first_expression) do |expression1, expression2|
Arel::Nodes::Addition.new(expression1, expression2)
end
end
# (SIMILARITY(COALESCE(column, ''), 'search_string') * CAST(multiplier AS numeric))
def self.rule_to_arel(search, rule)
Arel::Nodes::Grouping.new(
Arel::Nodes::Multiplication.new(
similarity_function_call(search, column_expression(rule)),
multiplier_expression(rule)
)
)
end
# COALESCE(column, '')
def self.column_expression(rule)
Arel::Nodes::NamedFunction.new('COALESCE', [rule.fetch(:column), EMPTY_STRING])
end
# SIMILARITY(COALESCE(column, ''), 'search_string')
def self.similarity_function_call(search, column)
Arel::Nodes::NamedFunction.new('SIMILARITY', [column, Arel.sql(search)])
end
# CAST(multiplier AS numeric)
def self.multiplier_expression(rule)
quoted_multiplier = ActiveRecord::Base.connection.quote(rule.fetch(:multiplier, DEFAULT_MULTIPLIER).to_s)
Arel::Nodes::NamedFunction.new('CAST', [Arel.sql(quoted_multiplier).as('numeric')])
end
private_class_method :rule_to_arel
private_class_method :column_expression
private_class_method :similarity_function_call
private_class_method :multiplier_expression
end
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Database::SimilarityScore do
let(:search) { '' }
let(:query_result) { ActiveRecord::Base.connection.execute(query).to_a }
let(:query) do
# In memory query, with the id as the tie breaker.
<<-SQL
SELECT *, #{order_expression} AS similarity
FROM (
VALUES (1, 'Git', 'git', 'git source code mirror. this is a publish-only repository.'),
(2, 'GitLab Runner', 'gitlab-runner', 'official helm chart for the gitlab runner'),
(3, 'gitaly', 'gitaly', 'gitaly is a git rpc service for handling all the git calls made by gitlab'),
(4, 'GitLab', 'gitlab', 'gitlab is an open source end-to-end software development platform with built-in version control'),
(5, 'Gitlab Danger', 'gitlab-danger', 'this gem provides common dangerfile and plugins for gitlab projects'),
(6, 'different', 'same', 'same'),
(7, 'same', 'different', 'same'),
(8, 'gitlab-styles', 'gitlab-styles', 'gitlab style guides and shared style configs.'),
(9, '🔒 gitaly', 'gitaly-sec', 'security mirror for gitaly')
) tbl (id, name, path, descrption) ORDER BY #{order_expression} DESC, id DESC;
SQL
end
let(:order_expression) do
Gitlab::Database::SimilarityScore.build_expression(search: search, rules: [{ column: Arel.sql('path') }]).to_sql
end
subject { query_result.take(3).map { |row| row['path'] } }
context 'when passing empty values' do
context 'when search is nil' do
let(:search) { nil }
it 'orders by a constant 0 value' do
expect(query).to include('ORDER BY CAST(0 AS integer) DESC')
end
end
context 'when rules are empty' do
let(:search) { 'text' }
let(:order_expression) do
Gitlab::Database::SimilarityScore.build_expression(search: search, rules: []).to_sql
end
it 'orders by a constant 0 value' do
expect(query).to include('ORDER BY CAST(0 AS integer) DESC')
end
end
end
context 'when similarity scoring based on the path' do
let(:search) { 'git' }
context 'when searching for `git`' do
let(:search) { 'git' }
it { expect(subject).to eq(%w[git gitlab gitaly]) }
end
context 'when searching for `gitlab`' do
let(:search) { 'gitlab' }
it { expect(subject).to eq(%w[gitlab gitlab-styles gitlab-danger]) }
end
context 'when searching for something unrelated' do
let(:search) { 'xyz' }
it 'results have 0 similarity score' do
expect(query_result.map { |row| row['similarity'] }).to all(eq(0))
end
end
end
describe 'score multiplier' do
let(:order_expression) do
Gitlab::Database::SimilarityScore.build_expression(search: search, rules: [
{ column: Arel.sql('path'), multiplier: 1 },
{ column: Arel.sql('name'), multiplier: 0.8 }
]).to_sql
end
let(:search) { 'different' }
it 'ranks `path` matches higher' do
expect(subject).to eq(%w[different same gitlab-danger])
end
end
end
......@@ -860,6 +860,66 @@ RSpec.describe API::Groups do
end
end
context 'with similarity ordering' do
let_it_be(:group_with_projects) { create(:group) }
let_it_be(:project_1) { create(:project, name: 'Project', path: 'project', group: group_with_projects) }
let_it_be(:project_2) { create(:project, name: 'Test Project', path: 'test-project', group: group_with_projects) }
let_it_be(:project_3) { create(:project, name: 'Test', path: 'test', group: group_with_projects) }
let(:params) { { order_by: 'similarity', search: 'test' } }
subject { get api("/groups/#{group_with_projects.id}/projects", user1), params: params }
before do
group_with_projects.add_owner(user1)
end
it 'returns items based ordered by similarity' do
subject
expect(response).to have_gitlab_http_status(:ok)
expect(response).to include_pagination_headers
expect(json_response.length).to eq(2)
project_names = json_response.map { |proj| proj['name'] }
expect(project_names).to eq(['Test', 'Test Project'])
end
context 'when `search` parameter is not given' do
before do
params.delete(:search)
end
it 'returns items ordered by name' do
subject
expect(response).to have_gitlab_http_status(:ok)
expect(response).to include_pagination_headers
expect(json_response.length).to eq(3)
project_names = json_response.map { |proj| proj['name'] }
expect(project_names).to eq(['Project', 'Test', 'Test Project'])
end
end
context 'when `similarity_search` feature flag is off' do
before do
stub_feature_flags(similarity_search: false)
end
it 'returns items ordered by name' do
subject
expect(response).to have_gitlab_http_status(:ok)
expect(response).to include_pagination_headers
expect(json_response.length).to eq(2)
project_names = json_response.map { |proj| proj['name'] }
expect(project_names).to eq(['Test', 'Test Project'])
end
end
end
it "returns the group's projects with simple representation" do
get api("/groups/#{group1.id}/projects", user1), params: { simple: true }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment