Commit 9c124899 authored by Dmitry Gruzd, committed by Dylan Griffith

Switch to prefix code search

Currently the code search uses ngrams to allow searching for prefixes as
well as full matches. This takes up a lot of storage and can be replaced
with a prefix search.
This change removes the usage of edgeNGram_filter from our index
mappings.
parent 0f896bd4
......@@ -54,7 +54,7 @@ Please see the `sha_tokenizer` explanation later below for an example.
#### `code_analyzer`
Used when indexing a blob's filename and content. Uses the `whitespace` tokenizer and the filters: [`code`](#code), [`edgeNGram_filter`](#edgengram_filter), `lowercase`, and `asciifolding`
Used when indexing a blob's filename and content. Uses the `whitespace` tokenizer and the filters: [`code`](#code), `lowercase`, and `asciifolding`
The `whitespace` tokenizer was selected in order to have more control over how tokens are split. For example the string `Foo::bar(4)` needs to generate tokens like `Foo` and `bar(4)` in order to be properly searched.
......
---
title: Remove partial word matching from code search
merge_request: 32771
author:
type: changed
......@@ -35,7 +35,7 @@ module Elastic
code_analyzer: {
type: 'custom',
tokenizer: 'whitespace',
filter: %w(code edgeNGram_filter lowercase asciifolding)
filter: %w(code lowercase asciifolding)
},
code_search_analyzer: {
type: 'custom',
......@@ -60,11 +60,6 @@ module Elastic
'\.([^.]+)(?=\.|\s|\Z)', # separate terms on periods
'\/?([^\/]+)(?=\/|\b)' # separate path terms (like/this/one)
]
},
edgeNGram_filter: {
type: 'edgeNGram',
min_gram: 2,
max_gram: 40
}
},
tokenizer: {
......
......@@ -537,7 +537,15 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin
blobs = results.objects('blobs')
expect(blobs.first.data).to include('def')
expect(results.blobs_count).to eq 7
expect(results.blobs_count).to eq 5
end
it 'finds blobs by prefix search' do
results = described_class.new(user, 'defau*', limit_project_ids)
blobs = results.objects('blobs')
expect(blobs.first.data).to include('default')
expect(results.blobs_count).to eq 3
end
it 'finds blobs from public projects only' do
......@@ -547,13 +555,13 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin
ensure_elasticsearch_index!
results = described_class.new(user, 'def', [project_1.id])
expect(results.blobs_count).to eq 7
expect(results.blobs_count).to eq 5
result_project_ids = results.objects('blobs').map(&:project_id)
expect(result_project_ids.uniq).to eq([project_1.id])
results = described_class.new(user, 'def', [project_1.id, project_2.id])
expect(results.blobs_count).to eq 14
expect(results.blobs_count).to eq 10
end
it 'returns zero when blobs are not found' do
......@@ -580,7 +588,8 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin
expect(search_for('write')).to include('test.txt')
end
it 'find by first two words' do
# Re-enable after fixing https://gitlab.com/gitlab-org/gitlab/-/issues/10693#note_349683299
xit 'find by first two words' do
expect(search_for('writeString')).to include('test.txt')
end
......@@ -591,6 +600,10 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin
it 'find by exact match' do
expect(search_for('writeStringToFile')).to include('test.txt')
end
it 'find by prefix search' do
expect(search_for('writeStr*')).to include('test.txt')
end
end
context 'Searches special characters' do
......
......@@ -129,7 +129,7 @@ describe API::Search do
context 'filters' do
it 'by filename' do
get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon filename:PROCESS.md' }
get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon* filename:PROCESS.md' }
expect(response).to have_gitlab_http_status(:ok)
expect(json_response.size).to eq(1)
......@@ -137,7 +137,7 @@ describe API::Search do
end
it 'by path' do
get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon path:markdown' }
get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon* path:markdown' }
expect(response).to have_gitlab_http_status(:ok)
expect(json_response.size).to eq(1)
......@@ -147,7 +147,7 @@ describe API::Search do
end
it 'by extension' do
get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon extension:md' }
get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon* extension:md' }
expect(response).to have_gitlab_http_status(:ok)
expect(json_response.size).to eq(3)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment