Solution to Genomic-Range-Query by codility

21 Jan

January 21, 2014 Sheng 61

Question: http://codility.com/demo/take-sample-test/genomic_range_query

Question Name: GenomicRangeQuery

This is a typical case that uses more space for less time.

def solution(S, P, Q):

result = []

DNA_len = len(S)

mapping = {"A":1, "C":2, "G":3, "T":4}

# next_nucl is used to store the position information

# next_nucl[0] is about the "A" nucleotides, [1] about "C"

# [2] about "G", and [3] about "T"

# next_nucl[i][j] = k means: for the corresponding nucleotides i,

# at position j, the next corresponding nucleotides appears

# at position k (including j)

# k == -1 means: the next corresponding nucleotides does not exist

next_nucl = [[-1]*DNA_len, [-1]*DNA_len, [-1]*DNA_len, [-1]*DNA_len]

# Scan the whole DNA sequence, and retrieve the position information

next_nucl[mapping[S[-1]] - 1][-1] = DNA_len-1

for index in range(DNA_len-2,-1,-1):

next_nucl[0][index] = next_nucl[0][index+1]

next_nucl[1][index] = next_nucl[1][index+1]

next_nucl[2][index] = next_nucl[2][index+1]

next_nucl[3][index] = next_nucl[3][index+1]

next_nucl[mapping[S[index]] - 1][index] = index

for index in range(0,len(P)):

if next_nucl[0][P[index]] != -1 and next_nucl[0][P[index]] <= Q[index]:

result.append(1)

elif next_nucl[1][P[index]] != -1 and next_nucl[1][P[index]] <= Q[index]:

result.append(2)

elif next_nucl[2][P[index]] != -1 and next_nucl[2][P[index]] <= Q[index]:

result.append(3)

else:

result.append(4)

return result

61 Replies to “Solution to Genomic-Range-Query by codility”

Alessandro says:
February 13, 2014 at 7:57 am
In this one I implemented a Segmented Tree, which was aaaa looooootttt more code than yours @o@.
Reply
- Sheng says:
  February 13, 2014 at 3:54 pm
  Segmented Tree is a more powerful and more universal solution for this kind of questions. The major reason, that I did not use it here, is: I did not master it. If you are willing, could you please share your code here, as another solution?
  Reply
- YC Chiranjeevi says:
  June 14, 2018 at 2:16 am
  Hey Allesandro, could you please post your code, if you are ok to share.
  Reply
Alessandro says:
February 13, 2014 at 7:58 am
(by the way I Subscribed to your blog, cool posts for those interested in programming challenges!)
Reply
- Sheng says:
  February 13, 2014 at 4:21 pm
  Thanks a lot! These questions should be also helpful to the internship/job seekers, like me~
  Reply

Here is my solution. Python, result 100/100. A bit different approach than yours, closer to Lesson 3 topic (Prefix Sums).

def prefix_sums(A, mapping):

n = len(A)

sums = [[0] * 4 for i in xrange(n + 1)]

for k in xrange(1, n + 1):

sums[k] = sums[k - 1][:]

sums[k][mapping[A[k - 1]] - 1] += 1

return sums

def get_slice_sum(Qi, Pi):

slice_sum = [0] * len(Qi)

for i in xrange(len(Qi)):

slice_sum[i] = Qi[i] - Pi[i]

return slice_sum

def solution(S, P, Q):

mapping = {'A':1, 'C':2, 'G':3, 'T':4}

sums = prefix_sums(S, mapping)

result = [0] * len(P)

for i in xrange(len(P)):

slice_sum = get_slice_sum(sums[Q[i] + 1], sums[P[i]])

if slice_sum[0] != 0:

result[i] = 1

elif slice_sum[1] != 0:

result[i] = 2

elif slice_sum[2] != 0:

result[i] = 3

else:

result[i] = 4

return result

Sorry for bad tag usage in my previous post attempt 🙂

Sheng says:
February 27, 2015 at 12:19 am
Thanks for sharing! I removed your previous mis-formatted comment to keep clean.
Reply

Great code!! Just making it a little more verbose without a dictionary for expressiveness :

def solution(S, P, Q):

n = len(S)

sumA = [0]*(n+1)

sumC = [0]*(n+1)

sumG = [0]*(n+1)

sumT = [0]*(n+1)

for idx, nucleotide in enumerate(S):

sumA[idx+1] = sumA[idx]

sumC[idx+1] = sumC[idx]

sumT[idx+1] = sumT[idx]

sumG[idx+1] = sumG[idx]

if nucleotide == 'A':

sumA[idx+1] += 1

elif nucleotide == 'C':

sumC[idx+1] += 1

elif nucleotide == 'G':

sumG[idx+1] += 1

else:

sumT[idx+1] += 1

result = [0]*len(P)

for i in range(len(P)):

AsInRange = sumA[Q[i] + 1]- sumA[P[i]]

CsInRange = sumC[Q[i] + 1]- sumC[P[i]]

GsInRange = sumG[Q[i] + 1]- sumG[P[i]]

#TsInRange = sumT[Q[i] + 1]- sumT[P[i]]

if AsInRange > 0:

result[i] = 1

elif CsInRange > 0:

result[i] = 2

elif GsInRange > 0:

result[i] = 3

else:

result[i] = 4

return result

Sheng says:
August 14, 2015 at 12:21 am
Great! Thanks!
Reply

Solution in Golang without going to a map to convert nucleotide to number.

package solution

// you can also use imports, for example:

// import "fmt"

// you can use fmt.Println for debugging purposes, e.g.

// fmt.Println("this is a debug message")

type nucleotideSummations []nucleotides

func (n nucleotideSummations) exists(i, j, nucleotide int) bool {

i--

var start int

if i < 0 {

start = 0

} else {

start = n[i][nucleotide]

}

return n[j][nucleotide]-start > 0

}

type nucleotides []int

func populateSummations(s string) nucleotideSummations {

ns := make(nucleotideSummations, 0, len(s))

for i, c := range s {

n := make(nucleotides, 4)

if i > 0 {

copy(n, ns[i-1])

}

switch c {

case 'A':

n[0]++

case 'C':

n[1]++

case 'G':

n[2]++

default:

n[3]++

}

ns = append(ns, n)

}

return ns

}

func Solution(S string, P []int, Q []int) []int {

ns := populateSummations(S)

result := make([]int, 0, len(P))

length := len(P)

for i := 0; i < length; i++ {

start, stop := P[i], Q[i]

for j := 0; j < 4; j++ {

if ns.exists(start, stop, j) {

result = append(result, j+1)

break

}

return result

}

Sheng says:
April 5, 2015 at 10:23 pm
Thanks for sharing!
Reply

20150525 says:
May 25, 2015 at 4:09 am
Why don’t set the initialize value of next_nucl to DNA_len instead of -1?
Reply
- Sheng says:
  May 25, 2015 at 10:37 pm
  Yes, you can. It is more traditional and better to use DNA_len as the not-exist-item-index.
  Reply

impact = {

'A' : 1,

'C' : 2,

'G' : 3,

'T' : 4

}

def get_min(counts, p, q):

for key in 'A', 'C', 'G', 'T':

p_count = counts[key][p-1] if p > 0 else 0

if (counts[key][q] - p_count > 0):

return impact[key]

def solution(S, P, Q):

N = len(S)

counts = dict(

A = [0] * N,

C = [0] * N,

G = [0] * N,

T = [0] * N)

for i in xrange(N):

for key in counts:

counts[key][i] = counts[key][i-1]

counts[S[i]][i] += 1

return [get_min(counts, p, q) for p, q in zip(P,Q)]

Trying to understand the time complexity problem here. This is apparently O(N*M) instead of the required O(N+M). Can someone explain why?

def solution(S, P, Q):

impact_value_dict = {'A':1,'C':2,'G':3,'T':4}

nucleotides = [0]*len(P)

for x in xrange(0,len(P)):

nucleotides[x] = sorted(list(set(S[P[x]:Q[x]+1])))[0]

return [impact_value_dict.get(n) for n in nucleotides]

Sheng says:
July 19, 2015 at 12:39 am
The for loop itself is O(M). And inside the loop, the sorted statement is O(NlogN). Totally, it is O(M*N*logN).
Reply

C# solution – similar approach

using System;

class Solution {

private int GetVal(char c)

{

switch(c)

{

case 'A': return 0;

case 'C': return 1;

case 'G': return 2;

default: return 3;

}

public int[] solution(string S, int[] P, int[] Q) {

var counts = new int[4, S.Length];

for(int i = 0; i < S.Length; i++)

{

var val = GetVal(S[i]);

for (int j = 0; j < 4; j++)

counts[j, i] = (j == val ? 1 : 0) + (i == 0 ? 0 : counts[j, i - 1]);

}

var result = new int[P.Length];

for (int i = 0; i < P.Length; i++)

{

var p = P[i];

var q = Q[i];

for (int j = 0; j < 4; j++)

{

if (counts[j, q] - (p == 0 ? 0 : counts[j, p-1]) > 0)

{

result[i] = j + 1;

break;

}

return result;

}

Adding a Java Solution 100/100, https://codility.com/demo/results/demoJFB5EV-EG8/

class Solution {

private ImpactFactorHolder[] mHolder;

private static final int A=0,C=1,G=2,T=3;

public int[] solution(String S, int[] P, int[] Q) {

mHolder = createImpactHolderArray(S);

int queriesLength = P.length;

int[] result = new int[queriesLength];

for (int i = 0; i < queriesLength; ++i ) {

int value = 0;

if( P[i] == Q[i]) {

value = lookupValueForIndex(S.charAt(P[i])) + 1;

} else {

value = calculateMinImpactFactor(P[i], Q[i]);

}

result[i] = value;

}

return result;

}

public int calculateMinImpactFactor(int P, int Q) {

int minImpactFactor = 3;

for (int nucleotide = A; nucleotide <= T; ++nucleotide ) {

int qValue = mHolder[nucleotide].mOcurrencesSum[Q];

int pValue = mHolder[nucleotide].mOcurrencesSum[P];

// handling special cases when the less value is assigned on the P index

if( P-1 >= 0 ) {

pValue = mHolder[nucleotide].mOcurrencesSum[P-1] == 0 ? 0 : pValue;

} else if ( P == 0 ) {

pValue = mHolder[nucleotide].mOcurrencesSum[P] == 1 ? 0 : pValue;

}

if ( qValue - pValue > 0) {

minImpactFactor = nucleotide;

break;

}

return minImpactFactor + 1;

}

public int lookupValueForIndex(char nucleotide) {

int value = 0;

switch (nucleotide) {

case 'A' :

value = A;

break;

case 'C' :

value = C;

break;

case 'G':

value = G;

break;

case 'T':

value = T;

break;

default:

break;

}

return value;

}

public ImpactFactorHolder[] createImpactHolderArray(String S) {

int length = S.length();

ImpactFactorHolder[] holder = new ImpactFactorHolder[4];

holder[A] = new ImpactFactorHolder(1,'A', length);

holder[C] = new ImpactFactorHolder(2,'C', length);

holder[G] = new ImpactFactorHolder(3,'G', length);

holder[T] = new ImpactFactorHolder(4,'T', length);

int i =0;

for(char c : S.toCharArray()) {

int nucleotide = lookupValueForIndex(c);

++holder[nucleotide].mAcum;

holder[nucleotide].mOcurrencesSum[i] = holder[nucleotide].mAcum;

holder[A].mOcurrencesSum[i] = holder[A].mAcum;

holder[C].mOcurrencesSum[i] = holder[C].mAcum;

holder[G].mOcurrencesSum[i] = holder[G].mAcum;

holder[T].mOcurrencesSum[i] = holder[T].mAcum;

++i;

}

return holder;

}

private static class ImpactFactorHolder {

public ImpactFactorHolder(int impactFactor, char nucleotide, int length) {

mImpactFactor = impactFactor;

mNucleotide = nucleotide;

mOcurrencesSum = new int[length];

mAcum = 0;

}

int mImpactFactor;

char mNucleotide;

int[] mOcurrencesSum;

int mAcum;

}

I know the ImpactFactorHolder class might be too much, but It was a little bit tricky for me on what was happening, my initial attempts I used a two dimensions arrays, but somehow I wasn’t able to deal with some of the test cases and adding that extra class helped me a little bit to understand and solve the problem.

I took the liberty of Implementing the original solution in a more pythonic way, using dictionaries, removing the “-1” trick, using reversed(), enumerate() and zip() instead of looping with an index

def solution(S, P, Q):

result = []

nDNA = len(S)

# next_occurrence is used to store the position of the next occurrence of each letter

# Example:

# next_occurrence['A'][j] = k means:

# from position j included, the next nucleotide A appears

# at position k (including j)

# k == nDNA+9999 means: there is no occurrence of A in S after j (j included)

next_occurrence = {letter:[nDNA+9999] * nDNA for letter in 'ACGT'}

next_occurrence[S[-1]][-1] = nDNA-1

# Scan the whole DNA sequence

# For each position, store the index of next occurrence of each letter

for cursor, nucleotide in reversed(list(enumerate(S[:-1]))):

# By default, given a letter L, next occurrence of L from the cursor is the same as from cursor+1

for letter in 'ACGT':

next_occurrence[letter][cursor] = next_occurrence[letter][cursor+1]

# Except if S[cursor] = L. Then the next occurrence is cursor itself.

next_occurrence[nucleotide][cursor] = cursor

# Scan the requests

# For each segment [p, q], ...

for p, q in zip(P, Q):

# ...check if the next occurrence of A after p is before q

if next_occurrence['A'][p] <= q:

result.append(1)

# ...check if the next occurrence of C after p is before q

elif next_occurrence['C'][p] <= q:

result.append(2)

# ...check if the next occurrence of G after p is before q

elif next_occurrence['G'][p] <= q:

result.append(3)

# ...if none of the above, p must be a T followed only with more T's

else:

result.append(4)

return result

Son Thai says:
October 11, 2015 at 12:12 am
I had an idea but could not implement it due to practical number range limits. Small numbers may work.
Instead of impact 1,2,3,4; encode them as primes 2,3,5,7.
Instead of prefix-summing impacts, prefix multiplying the primes (equivalent to summing logs of primes).
Instead of subtracting prefix sums, use division.
For each segment, the division result is tested for divisibility of 2, then 3, then 5, then 7. If divisible, decode back to the minimum impact 1,2,3,4 correspondingly.
Reply
- Sheng says:
  October 17, 2015 at 12:34 am
  Yes, if they are short and the product is small, it works.
  Reply

Hi,
I am trying out Codility exercises to help me learn coding as I am preparing for interviews. Many thanks to Sheng, and other contributors, for the enormous help he is providing to people like me who is trying to learn on our own.
For this problem, I cooked up this solution in Python but I am not sure if it matches all the time and space restrictions of the problem. Any help would be appreciated a lot. Thanks in advance.

def genomicRangeQuery(S,P,Q):

# Lengths of P AND Q are same = M

# Length of S = N

# impact is a dictionary to map the values TO integer

impact = {'A':1, 'C': 2, 'G':3, 'T':4 }

M = len(P)

#Convert S, which is a string of the type 'ATCGGCAT' to integers

SS = [0]*len(S)

for i,el in enumerate(S):

SS[i] = impact[el]

#this array keeps the minimum impact factor for each (P[i],Q[i]) pair

min_factor = [0]*M

for i in xrange(M):

min_factor[i] = min(set(SS[P[i]:(Q[i]+1)]))

return min_factor

Sheng says:
October 30, 2015 at 12:00 am
Hi, thanks for visiting my blog! Unfortunately your solution does not match the time restrictions of the problem. Let the N be the length of the DNA S, and M be the length of queries (P and Q), the statement “min(set(SS[P[i]:(Q[i]+1)]))” is O(N), and it is inside one loop of O(M). Therefore, the total complexity is O(M*N).
Reply

Here’s a 100% score solution I did straight from codility in java. It’s definitely a hack’n’slash patch job to deal with some of the test cases, but I honestly didn’t understand some of the logic that others were employing.(Especially Luciano Issoe’s InRange variables in his last FOR loop. A quick explanation would be great =D)
Anyhow, hope this can help!

class Solution {

public int[] solution(String S, int[] P, int[] Q) {

int len = S.length();

int[] Acount = new int[len];

int [] Ccount = new int[len];

int [] Gcount = new int[len];

int [] Tcount = new int[len];

if (S.charAt(0) == 'A')

Acount[0]++;

else if (S.charAt(0) == 'C')

Ccount[0]++;

else if (S.charAt(0) == 'G')

Gcount[0]++;

else if (S.charAt(0) == 'T')

Tcount[0]++;

for (int i=1;i<len;i++){

Acount[i] = Acount[i-1];

Ccount[i] = Ccount[i-1];

Gcount[i] = Gcount[i-1];

Tcount[i] = Tcount[i-1];

if (S.charAt(i) == 'A')

Acount[i] ++;

else if (S.charAt(i) == 'C')

Ccount[i] ++;

else if (S.charAt(i) == 'G')

Gcount[i] ++;

else if (S.charAt(i) == 'T')

Tcount[i] ++;

}

int [] sol = new int[P.length];

for (int i=0;i<P.length;i++){

int start = P[i];

int end = Q[i];

char c = S.charAt(start);

if ( ((Acount[end] - Acount[start]) > 0) || c == 'A' )

sol[i] = 1;

else if ( (Ccount[end] - Ccount[start]) > 0 || c == 'C')

sol[i] = 2;

else if ( (Gcount[end] - Gcount[start]) > 0 || c == 'G')

sol[i] = 3;

else

sol[i] = 4;

}

return sol;

}

Tim Nguyen says:
December 23, 2015 at 3:27 pm
Hi Sheng,
This is a really great solution. How did you come up with this approach? Like is there any tricks or tips to know how I should approach the solution for this problem? I gave up after 3hrs solving since my approaches were wrong. I just want to know how you figured this problem out. Like ways to think and steps to approach the solution.
Thanks 🙂
Reply
- Sheng says:
  January 1, 2016 at 7:25 pm
  Hi Tim. Sorry, nothing but practice more 🙂
  Reply

Hello, here’s something I came up with in C, since there aren’t many C solutions I thought it could help. Seems kinda long and probably could use some optimisation.
Feel free to comment!

// you can write to stdout for debugging purposes, e.g.

// printf("this is a debug messagen");

struct Results solution(char *S, int P[], int Q[], int M) {

struct Results result;

// write your code in C99 (gcc 4.8.2)

result.A=(int*)malloc(sizeof(int)*M);

int a=0;

int b = 0;

int c =0;

int d =0;

int counter=0;

for (int i =0 ; i<M ; i++){

result.A[i]=4;

}

int temp [4] [100000];

for(int i =0; i<4 ; i++){

for (int j =0 ; j<100000 ; j++){

temp [i] [j] = -1;

}

while (*(S+counter)!=''){

switch (*(S+counter)){

case 'A':{

for(int i =a ; i<=counter ; i++){

if (a!=temp[0][i]){

temp[0] [i]=counter;

a=counter;

}

else{

a++;

}

case 'C':{

for(int i =b ; i<=counter ; i++){

if (b!=temp[1][i]){

temp[1] [i]=counter;

b=counter;

}

else{

b++;

}

case 'G':{

for(int i =c ; i<=counter ; i++){

if (c!=temp[2][i]){

temp[2] [i]=counter;

c=counter;

}

else{

c++;

}

case 'T':{

for(int i =d ; i<=counter ; i++){

if (d!=temp[3][i]){

temp[3] [i]=counter;

d=counter;

}

else{

d++;

}

counter++;

}

for (int i=0 ; i<M ; i++){

if (temp[0] [P[i]]!=-1&&Q[i]>=temp[0] [P[i]]){

result.A[i]=1;

}

else if (temp[1] [P[i]]!=-1 && Q[i]>=temp[1] [P[i]]){

result.A[i]=2;

}

else if (temp[2] [P[i]]!=-1 && Q[i]>=temp[2] [P[i]]){

result.A[i]=3;

}

else if(temp[3] [P[i]]!=-1 && Q[i]>=temp[3] [P[i]]){

result.A[i]=4;

}

result.M = M;

return result;

}

Sheng says:
April 4, 2016 at 7:44 pm
Thanks for C !
Reply

I scored 87% with this solution. The detected time complexity is O(N + M) but I get “Timeout Error” on the last check. Could you tell me why by any chance?

def solution(S, P, Q):

ans = []

for i,j in zip(P, Q):

if 'A' in S[i:j+1]:

ans.append(1)

elif 'C' in S[i:j+1]:

ans.append(2)

elif 'G' in S[i:j+1]:

ans.append(3)

elif 'T' in S[i:j+1]:

ans.append(4)

return ans

It’s O(N*M), not O(N+M). Because (‘A’ in S[i:j+1]) is O(N) in worst case.

Hi, I got 100% with a similar solution, also with “detected time complexity O(N + M)”:

def solution(S, P, Q):

query = []

for i, j in zip(P, Q):

sub_S = S[i:j+1]

if 'A' in sub_S:

query.append(1)

elif 'C' in sub_S:

query.append(2)

elif 'G' in sub_S:

query.append(3)

else:

query.append(4)

return query

Sheng, can you comment on why mine might have passed the performance checks? Should it not pass in general?
Thanks 🙂

The “detected time complexity” is not always accurate. Because “”‘A’ in sub_S” is an O(N) operation, I would take your solution as O(N*M).

Hi Sheng, i’ve made a similar code, that got 100/100 as well an it ilustrates pretty well what you’re trying to say:

def solution(S, P, Q):

values = {"A":1,"C":2,"G":3,"T":4}

ans = []

for i, j in zip(P, Q):

for v in values:

if v in S[i:j+1]:

ans.append(values[v])

break

return ans

It’s the exact same principle as the examples above, I just tried not to exceed in the if else repetition, but it does exactly the same thing.
On the worst case cenario, it is indeed O(N*M) since python uses linear search. If all the characters of the DNA were “T”, on the lasr position, it would be N*M, but since the avarege is lower than N*M it gets as N+M

Daniel Ribeiro says:
May 18, 2016 at 4:47 am
Here a nice PHP solution
Reply
- Sheng says:
  May 23, 2016 at 9:36 pm
  Please refer to “Guideline for Comments” (in the right column) to post code.
  Reply

Javascript code 100%

function solution(S, P, Q) {

// write your code in JavaScript (Node.js 4.0.0)

var matrix = {A:[0], C:[0], G:[0], T:[0]}

var result=[], i, k, p, q, val

for(i = 0; i<S.length; i++){

val = S[i];

if(i === 0) {

matrix[val][0]=1

continue

}

matrix.A[i] = matrix.A[i -1]

matrix.C[i] = matrix.C[i -1]

matrix.G[i] = matrix.G[i -1]

matrix.T[i] = matrix.T[i -1]

matrix[val][i]++

}

for(k =0; k<P.length; k++) {

p = P[k]

q = Q[k]

if(matrix.A[q] - matrix.A[p] > 0 || S[p] === 'A') {

result[k] = 1

}

else if(matrix.C[q] - matrix.C[p] > 0 || S[p] === 'C') {

result[k] = 2

}

else if(matrix.G[q] - matrix.G[p] > 0 || S[p] === 'G') {

result[k] = 3

}

else{

result[k] = 4

}

return result

}

Small improvement to @karol solution.

def prefix_sums(A, mapping):

n = len(A)

sums = [[0] * 4 for i in xrange(n + 1)]

for k in xrange(1, n + 1):

sums[k] = sums[k - 1][:]

sums[k][mapping[A[k - 1]] - 1] += 1

return sums

def get_min_impact(Qi, Pi):

slice_sum = [0] * len(Qi)

for i in xrange(len(Qi)):

if (Qi[i] - Pi[i]) > 0:

return i + 1

return slice_sum

def solution(S, P, Q):

mapping = {'A':1, 'C':2, 'G':3, 'T':4}

sums = prefix_sums(S, mapping)

result = []

for i in xrange(len(P)):

result.append(get_min_impact(sums[Q[i] + 1], sums[P[i]]))

return result

I came up with this.

def genomic_range_query(s, p, q):

impact = {'A':1, 'C':2, 'G': 3, 'T':4}

impacts = []

for r in [slice(start, end+1) for start, end in zip(p, q)]:

impacts.append(impact[min(s[r])])

return impacts

Another solution in python , got 100/100 on codility, many thanks to all the contributors especially Sheng

def solution(S, P, Q):

# write your code in Python 2.7

n = len(S)

A = [0] * n

C = [0] * n

G = [0] * n

T = [0] * n

for i in range(n):

if S[i] == 'A':

A[i] += 1

elif S[i] == 'C':

C[i] += 1

elif S[i] == 'G':

G[i] += 1

else:

T[i] += 1

P_A = prefix(A)

P_C = prefix(C)

P_G = prefix(G)

P_T = prefix(T)

result = []

for i, j in zip(P,Q):

if P_A[j+1] - P_A[i] > 0:

result.append(1)

elif P_C[j+1] - P_C[i] > 0:

result.append(2)

elif P_G[j+1] - P_G[i] > 0:

result.append(3)

else:

result.append(4)

return result

def prefix(l):

m = len(l)

P = [0]*(m+1)

for i in range(1,m+1):

P[i] = P[i-1] + l[i-1]

return P

Sheng says:
March 18, 2017 at 9:35 am
Your are welcome :–)
Reply

Another Javascript solution:

function solution(S, P, Q) {

var nucleo = {

"A": 1,

"C": 2,

"G": 3,

"T": 4

};

// build nucleoIndexInRange prefix array

var nucleoIndexInRange = [];

for (var i=0; i<S.length; i++) {

var nucleoIndex = nucleo[S[i]]-1;

// iterate through nucleo types

for (var j=0; j<4; j++) {

if (i == 0) {

nucleoIndexInRange[j] = [];

nucleoIndexInRange[j][0] = 0;

}

var increment = (j == nucleoIndex) ? 1 : 0;

nucleoIndexInRange[j][i+1] = nucleoIndexInRange[j][i] + increment;

}

// return values

var values = [];

for (var i=0; i<P.length; i++) {

var left = P[i];

var right = Q[i]+1;

// iterate through nucleo types

for (var j=0; j<4; j++) {

var nucleoSum = nucleoIndexInRange[j][right] - nucleoIndexInRange[j][left];

if (nucleoSum > 0) {

values[i] = j+1;

break;

}

return values;

}

Clean VB:

Private Function solution(S As String, P As Integer(), Q As Integer()) As Integer()

Dim Code As New Dictionary(Of String, Integer)

Code.Add("A", 1)

Code.Add("C", 2)

Code.Add("G", 3)

Code.Add("T", 4)

Dim ImpactSum(0 to S.Length, 0 to 3) As Integer

For Index as integer = 1 to S.Length

For ImpactIndex as Integer = 0 to 3

ImpactSum(Index, ImpactIndex) = ImpactSum(Index - 1, ImpactIndex)

ImpactSum(Index, Code(S.SubString(Index - 1, 1)) - 1) += 1

Dim MinimumImpact(0 to P.Length - 1) as Integer

For QueryIndex As Integer = 0 to P.Length - 1

For ImpactIndex as Integer = 3 to 0 Step -1

Dim NumberOfInstances as Integer = ImpactSum(Q(QueryIndex) + 1, ImpactIndex) - ImpactSum(P(QueryIndex), ImpactIndex)

If NumberOfInstances > 0 Then MinimumImpact(QueryIndex) = ImpactIndex + 1

return MinimumImpact

End Function

Hello,
I am more a C++ dev, but I tried it in python, and it also gets the holy 100/100.
I don’t understand why you don’t loop through the letter, as this is a fixed size loop it does not account in time diff, right ?
Also I find my cost function very ugly now that I see what you did :p.

def cost(S):

if S is 'A':

return 1

elif S is 'C':

return 2;

elif S is 'G':

return 3;

elif S is 'T':

return 4;

else:

return 0;

def solution(S, P, Q):

arrSPerNucl = []

lastFound = [-1, -1, -1, -1]

for i in range(0, len(S)) :

lastFound[cost(S[i]) - 1] = i

arrSPerNucl.append(list(lastFound))

result = []

for i in range(0, len(P)) :

for j in range(0, 4) :

if arrSPerNucl[Q[i]][j] >= P[i] :

result.append(j + 1)

break

return result

Sheng says:
November 30, 2017 at 8:04 am
Yes, it’s more a design style, rather than a performance concern 🙂
Reply

I’ve solved this using prime numbers, but I’m failing at performance tests. I can’t see why. my time complexity is O(N+M)

def solution(S, P, Q):

prefix_muls = []

last_mul = 1

for genom in S:

if genom == 'A':

last_mul *= 5

elif genom == 'C':

last_mul *= 3

elif genom == 'G':

last_mul *= 2

prefix_muls.append(last_mul)

for i in range(len(P)):

mul = prefix_muls[Q[i]]

if P[i]:

mul //= prefix_muls[P[i]-1]

if mul % 5 == 0: # A

P[i] = 1

elif mul % 3 == 0: # C

P[i] = 2

elif mul % 2 == 0: # G

P[i] = 3

else: # T

P[i] = 4

return P

Hi,
Here’s my Python 3.6 solution, I think it’s a bit simpler than what I’ve seen here.

def solution(S, P, Q):

impact = {'A': 1,

'C': 2,

'G': 3,

'T': 4}

res = []

for i in range(len(P)):

for k in impact:

if k in S[P[i]:Q[i]+1]:

res.append(impact[k])

break

return res

Doc says:
May 13, 2019 at 4:50 am
1
2
for k in impact: #it is O(M) for definite
if k in S[P[i]:Q[i]+1]: #it is O(N) in worst case

section
Reply

Javascript

function solution(S, P, Q) {

var nucleo = {

"A": 1,

"C": 2,

"G": 3,

"T": 4

};

const nucleoKeys = Object.keys(nucleo);

// Prefix sum is 0 + increment

const prefixSum = nucleoKeys.map(nucleo => { // x4

let sum = 0;

const prefixsums = ('0' + S).split('').map(x => sum += x==nucleo);

// Prefix sum have a prepended 0 to begin.

return prefixsums;

});

// prefixSum :

0,0,1,1,1,1,1,2

0,1,1,1,2,3,3,3

0,0,0,1,1,1,1,1

0,0,0,0,0,0,1,1

// We have to return P.length values, so let's map and see who should be returned

return P.map((start,i) => {

const end = Q[i]+1; // end goes at +1

const minNucleoNameInRange = nucleoKeys.find((nucleo,i) => {

// Find will stop at the first match found

// We try matches from value 1 to 4 using nucleoKeys

const prefixForNucleo = prefixSum[i];

const valueForNucleo = prefixForNucleo[end] - prefixForNucleo[start];

// If value > 0 it means the nucleo is in the slice we selected

// Otherwise the count would be the same:

// for counting A in: [A,B] -> (remember: we append a 0 in the prefix sum) -> [0,1,1] -> 1 - 0 -> 1

// for counting A in: [A,B,A] -> [0,1,1,2] -> 1

// for counting B in: [A,B,A] -> [0,0,1,1] -> 1

if (valueForNucleo > 0) return true;

});

return nucleo[minNucleoNameInRange];

});

}

console.log(solution('CAGCCTA', [2, 5, 0], [4, 5, 6]));

Hi Sheng

Thanks!

Your solution led me in right direction.

Take a look at this “optimization”.

def solution(S, P, Q):

# write your code in Python 3.6

map = {'A':0, 'C':1, 'G':2, 'T':3}

answer=[]

sum=[0]*4

Ssize=len(S)+1

I=[[0]*Ssize, [0]*Ssize, [0]*Ssize, [0]*Ssize]

for i in range (Ssize-1):

I[0][i]=sum[0]

I[1][i]=sum[1]

I[2][i]=sum[2]

I[3][i]=sum[3]

sum[map[S[i]]]+=1

I[0][Ssize-1]=sum[0]

I[1][Ssize-1]=sum[1]

I[2][Ssize-1]=sum[2]

I[3][Ssize-1]=sum[3]

for i in range (len(P)):

if I[0][Q[i]+1] > I[0][P[i]]:

answer.append(1)

elif I[1][Q[i]+1] > I[1][P[i]]:

answer.append(2)

elif I[2][Q[i]+1] > I[2][P[i]]:

answer.append(3)

elif I[3][Q[i]+1] > I[3][P[i]]:

answer.append(4)

return answer

#Detected time complexity: O(N + M)

# you can write to stdout for debugging purposes, e.g.

# print("this is a debug message")

def solution(S, P, Q):

# write your code in Python 3.6

result = []

for k in range (len(P)):

p = P[k]

q = Q[k]

v = S[p:q]

v += S[q]

if p == 0:

v += S[p]

if p == q:

v += S[q]

if 'A' in v:

result.append(1)

elif 'C' in v:

result.append(2)

elif 'G' in v:

result.append(3)

elif 'T' in v:

result.append(4)

return result

pass

I wrote below answer to this… would love some insights…

def solution(S, P, Q):

# write your code in Python 3.6

imp_fac = {'A':1, 'C':2, 'G':3, 'T':4}

M = len(P)

result = []

curr_result = None

for index in range(M):

lower = P[index]

upper = Q[index]

sub_s = S[lower:upper+1]

if 'A' in sub_s:

curr_result = imp_fac['A']

elif 'C' in sub_s:

curr_result = imp_fac['C']

elif 'G' in sub_s:

curr_result = imp_fac['G']

elif 'T' in sub_s:

curr_result = imp_fac['T']

if curr_result not in (1, 2, 3, 4):

result += [0]

else:

result += [curr_result]

return result

Hi,
How is code O(N*M)

def solution(S, P, Q):

# write your code in Python 3.6

gn_arr = []

nt = {'A':1,'C':2,'G':3,'T':4}

for code in S:

gn_arr.append(nt[code])

result = []

for index in range(0,len(P)):

result.append(min(gn_arr[P[index]:(Q[index]+1)]))

return (result)

pass

Here’s my solution, O(N + M):

def solution(S, P, Q):

# write your code in Python 3.6

N = len(S)

M = len(P) # also M = len(Q)

# calculate running impact sums at each string position

sumA = [0] * (N + 1)

sumC = [0] * (N + 1)

sumG = [0] * (N + 1)

curSumA = 0

curSumC = 0

curSumG = 0

for i, c in enumerate(S):

if (c == 'A'):

curSumA += 1

elif (c == 'C'):

curSumC += 1

elif (c == 'G'):

curSumG += 1

sumA[i + 1] = curSumA

sumC[i + 1] = curSumC

sumG[i + 1] = curSumG

# find lowest impact in each query range in string

results = [4] * M

for k in range(M):

p = P[k]

q = Q[k] + 1

if (sumA[q] - sumA[p] > 0):

results[k] = 1

elif (sumC[q] - sumC[p] > 0):

results[k] = 2

elif (sumG[q] - sumG[p] > 0):

results[k] = 3

return results

Jay Bariya says:
January 6, 2023 at 4:57 am
Pretty neat and easy to understand solution. Thanks.
Reply

Here is my simple python solution O(N+.M) 100%

def solution(S, P, Q):

# write your code in Python 3.6

result = []

for i in range(len(P)):

S_COPY = S[:]

eachResult = S_COPY[P[i]:Q[i] + 1]

if "A" in eachResult:

result.append(1)

elif "C" in eachResult:

result.append(2)

elif "G" in eachResult:

result.append(3)

else:

result.append(4)

return result

Mustafa Kirimli says:
December 29, 2019 at 6:04 am
shorter version:
1
2
3
4
5
def solution(S, P, Q):
  return [my_comp(S[p:q+1]) for p, q in zip(P, Q)]

def my_comp(arr):
  return 1 if 'A' in arr else 2 if 'C' in arr else 3 if 'G' in arr else 4

Reply

Here’s what I went with. Python. I went with a prefix sums approach.

def solution(S, P, Q):

nucleotides_index_map = {

'A': 0,

'C': 1,

'G': 2,

'T': 3

}

nucleotide_sums = [[0, 0, 0, 0]]

for index, nucleotide in enumerate(S):

current_sum = nucleotide_sums[index].copy()

current_sum[nucleotides_index_map[nucleotide]] += 1

nucleotide_sums.append(current_sum)

impact_factors = []

for lower_bound, upper_bound in zip(P, Q):

for factor, (upper_sum, lower_sum) in enumerate(zip(nucleotide_sums[upper_bound + 1], nucleotide_sums[lower_bound]), 1):

if upper_sum - lower_sum:

impact_factors.append(factor)

break

return impact_factors

My solution in C++

// you can use includes, for example:

// #include <algorithm>

#include <unordered_map>

// you can write to stdout for debugging purposes, e.g.

// cout << "this is a debug message" << endl;

static const std::unordered_map<char, int> impact{

{'A', 1},

{'C', 2},

{'G', 3},

{'T', 4}

};

auto prefix_sums(const std::string& A) noexcept -> decltype(auto) {

auto n = A.size();

std::vector<std::vector<int>> P(n+1, std::vector<int>(4, 0));

for (auto k{static_cast<size_t>(1)}; k <n+1; ++k) {

P[k] = P[k-1];

++P[k][impact.at(A[k-1])-1];

}

return P;

}

std::vector<int> solution(std::string &S, std::vector<int> &P, std::vector<int> &Q) {

// write your code in C++14 (g++ 6.2.0)

std::vector<int> result;

auto prefix = prefix_sums(S);

for (auto index{static_cast<size_t>(0)}; index < P.size(); ++index) {

if (prefix[Q[index] +1][0] - prefix[P[index]][0] != 0)

result.emplace_back(1);

else if (prefix[Q[index] +1][1] - prefix[P[index]][1] != 0)

result.emplace_back(2);

else if (prefix[Q[index] +1][2] - prefix[P[index]][2] != 0)

result.emplace_back(3);

else

result.emplace_back(4);

}

return result;

}

S = 'CAGCCTA'

P = [2,5,0]

Q = [4,5,6]

res_lst=[]

out_lst=[]

def solution(S,P,Q):

for i in range(len(P)):

res_lst = S[P[i]:Q[i]+1]

#print (res_lst)

if 'A' in res_lst:

out_lst.append(1)

elif 'C' in res_lst:

out_lst.append(2)

elif 'G' in res_lst:

out_lst.append(3)

elif 'T' in res_lst:

out_lst.append(4)

else:

return None

print (out_lst[0:(len(P))])

#return (out_lst[0:(len(P))])

solution(S,P,Q)

Received 100/100 with O(N+M).

def find_min(A):

if "A" in A:

return 1

elif "C" in A:

return 2

elif "G" in A:

return 3

else:

return 4

def solution(S, P, Q):

# write your code in Python 3.6

result = [4]*len(P)

for i in range(len(P)):

result[i] = find_min(S[P[i]:Q[i]+1])

return result

Same solution, but a bit more intuitively written (Javascript):

Follow-along video: https://www.youtube.com/watch?v=4SyckIAmYXk

function genomicRangeQuery(S, P, Q) {

let resultArr = [];

let scores = { A: 1, C: 2, G: 3, T: 4 };

// initializing array of zeroes for each neucleotide, of the same length as the string

let occurences = {

A: new Array(S.length).fill(0),

C: new Array(S.length).fill(0),

G: new Array(S.length).fill(0),

T: new Array(S.length).fill(0)

};

// replacing zero with one where the neucleotide occurs

for (let i = 0; i < S.length; i += 1) {

let val = S[i];

occurences[val][i] = 1;

}

// summing up values of preceeding elements, for each neucleotide's array

let a = c = g = t = 0;

for (let i = 0; i < S.length; i += 1) {

let val = S[i];

if (val === 'A') a += 1;

else if (val === 'C') c += 1;

else if (val === 'G') g += 1;

else if (val === 'T') t += 1;

occurences['A'][i] = a;

occurences['C'][i] = c;

occurences['G'][i] = g;

occurences['T'][i] = t;

}

// comparing positions in each neucleotide's array to check for differences i.e. occurences

for (let i = 0; i < P.length; i += 1) {

let posP = P[i];

let posQ = Q[i];

// edge case: if the position in P and Q are same, then that neucleotide is the only occurance

if (posP === posQ) {

if (S[posP] === 'A')

resultArr.push(scores['A']);

else if (S[posP] === 'C')

resultArr.push(scores['C']);

else if (S[posP] === 'G')

resultArr.push(scores['G']);

else if (S[posP] === 'T')

resultArr.push(scores['T']);

} else {

// edge case: this || S[posQ] === ... is to handle an edge case when the neucleotide occurs once at the first letter and never again

if (occurences['A'][posQ] > occurences['A'][posP] || S[posP] === 'A')

resultArr.push(scores['A']);

else if (occurences['C'][posQ] > occurences['C'][posP] || S[posP] === 'C')

resultArr.push(scores['C']);

else if (occurences['G'][posQ] > occurences['G'][posP] || S[posP] === 'G')

resultArr.push(scores['G']);

else if (occurences['T'][posQ] > occurences['T'][posP] || S[posP] === 'T')

resultArr.push(scores['T']);

}

return resultArr;

}

61 Replies to “Solution to Genomic-Range-Query by codility”

Leave a Reply Cancel reply