DS-String

Hashing

In place removal of duplicate in string
1. Create array having 1 element for each character.
  bool bin_hash[NO_OF_CHARS] = {0}; //NO_OF_CHARS=256
2. Create two index, in and out for input and output string. int in, out;
3. While input string does not reach null character, keep moving in index if hash[this_char]!=0. 
If hash[this_char] == '\0', make hash[this_char]=1 and move out and in index.

char *removeDuplictesInPlace(char *str)  
 {  
  bool bin_hash[NO_OF_CHARS] = {0};  
  int in = 0, out = 0;   char this_char;    
  while (*(str + in))  
  {  
   temp = *(str + in);  
   if (bin_hash[this_char;] == 0)  
   {     bin_hash[this_char;] = 1;  *(str + out) = *(str + in); out++;    }  
   out++;  
  }     
  *(str+out) = '\0';   
  return str;  
 }  

Think, how to remove only adjacent duplicates?
ThinkRemove 'a', 'ab' from given string in single pass with no extra space


Other Use cases:
Remove characters from first string which are present in second string.

int *getHashArray(char *str)
{
   int *count = (int *)calloc(sizeof(int), NO_OF_CHARS);
   int i;
   for (i = 0; *(str+i);  i++)      count[*(str+i)]++;
   return count;
}

char *removeCommonChars(char *str, char *str2)
{
  int *count  = getHashArray(str2); //create Hash for 2nd string

Rest code is same as above.

Check if strings are rotated versions
temp=strcat(str1,str2);
void *ptr=strstr(tmp, str2); If ptr==NULL string2 is rotation of string1
Example: 
str1="Hello" str2="lloHe"
tmp="HellolloHe", and strstr will show that str2 is substring of tmp.

Check if given string is rotation of a palindrome
E.g. ppppq is rotation of palindrome ppqpp

Idea: 
Create tmp=strcat(str, str); //contains str of len n repeated.
Find if there is palindromic substring of length n in tmp.

Permute string to generate all combinations [understand recursion]

void permute_str(char *str, int left, int right)  //O(n*n!)
{
  int i;
  if(left==right) printf("%s\n", str);
  else
  {
    for(i=left; i<=right; i++)
    {
      swap((str+left), (str+i));
      permute_str(str, left+1, right);
      swap((str+left), (str+i));  //this is backtracking step
    }
  }

}
Example: 
To understand what is happening, modified function to print call trace (bold below).
diff permutestr.c permutestr2.c                                                                                                      14,16c14,15
<   int i;
< printf("str:%s, left:%d, right:%d\t", str, left, right);
<   if(left==right) printf("\n\toutput %s\n", str);
---
>   int i;
>   if(left==right) printf("%s\n", str);
21c20
<       swap((str+left), (str+i)); printf("swap1(%d,%d): %s\n", left, i, str);
---
>       swap((str+left), (str+i));
23,24c22
<       swap((str+left), (str+i));  //this is backtracking step
<       printf("swap2(%d, %d): %s\n", left, i, str);
---
>       swap((str+left), (str+i));  //this is backtracking step
28d25

<
For Input string: PQ
str:PQ, left:0, right:1 swap1(0,0): PQ
str:PQ, left:1, right:1
        output PQ
swap2(0, 0): PQ
swap1(0,1): QP
str:QP, left:1, right:1
        output QP
swap2(0, 1): PQ
For Input string PQR
str:PQR, left:0, right:2        swap1(0,0): PQR
str:PQR, left:1, right:2        swap1(1,1): PQR
str:PQR, left:2, right:2
        output PQR
swap2(1, 1): PQR
swap1(1,2): PRQ
str:PRQ, left:2, right:2
        output PRQ
swap2(1, 2): PQR
swap2(0, 0): PQR
swap1(0,1): QPR
str:QPR, left:1, right:2        swap1(1,1): QPR
str:QPR, left:2, right:2
        output QPR
swap2(1, 1): QPR
swap1(1,2): QRP
str:QRP, left:2, right:2
        output QRP
swap2(1, 2): QPR
swap2(0, 1): PQR
swap1(0,2): RQP
str:RQP, left:1, right:2        swap1(1,1): RQP
str:RQP, left:2, right:2
        output RQP
swap2(1, 1): RQP
swap1(1,2): RPQ
str:RPQ, left:2, right:2
        output RPQ
swap2(1, 2): RQP
swap2(0, 2): PQR

Print combinations of string recursive
void Combinations(int depth, char *combi, int start, char *str)
   int len=strlen(str);
   for(i=start; i<len; i++)
      combi[depth]=str[i];
      combi[depth+1]='\0'
      printf("%s ", combi);
      if(i < len-1)  Combinations(depth+1, combi, start+1, str);

Print permutations of string recursive
void Permutations(int depth, char *perm, int start, char *str)
   int len=strlen(str);perm);
   if(depth==len) printf("%s", perm);
   else
      for(i=start; i<len; i++)
         if(used[i]==0)
            used[i]=1;
            perm[depth]=str[i];

            Combinations(depth+1, perm, start+1, str);
            used[i]=0;

First first unique character in string
- Create hash map, getHashArray containing count of occurrences of each character in string. 
- Get first character whose count is 1.
  for(i=0;*(str+i);i++) //for each char in str
    if(hash[*(str+i)] == 1) { index=i; break; }  //index in hash array. str[index]

Optimization: 2nd loop of for each char can be optimized. 
store index if this is first occurrence in getHashArray().
struct indexhash{ int count; int index; };
struct indexhash*getHashArray(char *str)
Inside for loop at end:
if (count[*(str+i)].count == 1)   count[*(str+i)].index = i;
int firstNonRepeating(char *str)
{
  struct indexhash *cnt = getCharCountArray(str);
  int result = INT_MAX, i;
  for (i = 0; i < NO_OF_CHARS;  i++)
    if (cnt[i].cnt == 1 && result > cnt[i].index)
       result = cnt[i].index;
  free(cnt); 
  return result;

}


Reverse Words in given string
Delhi is capital of India -> India of capital is Delhi.
Idea:
1. Reverse all letters of each word -> ihleD si latipac fo aidnI
2. Reverse entire string -> India of capital is Delhi.

Function to reverse any sequence [When string is editable]
void reverse(char *start, char *end)
{ if(str==NULL) || *str='\0') return str;
char tmp;   while(start<end){  tmp=*start; *start++=*end; *end--=tmp; }

void reverse_recursive(char *str, int start, int end)
{ char ch; 
   if(str==NULL) return;
   if(start>=end) return;
   ch = *(str+start);   *(str+start)=*(str+end);    *(str+end)=ch;
   reverse_recursive(str, ++start, --end);
}

When string is not editable, create a copy using malloc and return that. Ensure to free after use in caller.

void ReverseWordsInString(char *s)
{
  char *start=s, *tmp=s;
  while(*tmp)
  {
    tmp++;
    if(*tmp=='\0') reverse(start, tmp-1);
    else if(*tmp==' ') { reverse(start, tmp-1); start=tmp + 1;  }
  }
  reverse(s, tmp-1);

}

Find if two strings are anagram
Anagram: string that contains same set of characters, order can be different.
int FindIfAnagram(char *str1, char *str2)
{
  int hash1[256]={0}, hash2[256]={0}, idx;
  for(idx=0; str1[idx] && str2[idx]; idx++)
  { hash1[str1[idx]]++; hash2[str2[idx]]++;  }
  if( str1[idx] || str2[idx] )
    return 0; //Not anagram as length differ
  for(idx=0;idx<256;idx++)
    if(hash1[idx] != hash2[idx])
      return 0;
  return 1;

}


Find if given string str3 interleaves strings str1 and str2
str3 is interleaving str1 and str 2, iff it contains all characters of str1 and str2 and order of characters is not changed.
int IsInterleaving(char *str1, char *str2, char *str3)
{
  while(*str3!=0)  //Iterate string to check for interleaving
  {
    if(*str1 == *str3) str1++;  //Move str1 if matches
    else if(*str2 == *str3) str2++; /Move str1 if matches
    else return 0; //Matches with none, return not interleaving
    str3++; //Next character of string to check
  }
  if(*A || *B)
    return 0;
  return 1;
}

O(m+n) where m and n are the lengths of strings str2 and str2

Print Interleaving of String
void PrintInterleaves(char *str1, char *str2, char *istr, int m, int n, int i)
{
   if(m==0 && n==0) printf("%s\n", istr);
   if(m!=0){ istr[i]=str1[0]; PrintInterleaves(str1+1, str2, istr, m-1, n, i+1); }
   if(n!=0) { istr[i]=str2[0]; PrintInterleaves(str1, str2+1, istr, m, n-1, i+1); }
}
istr=malloc(m+1); istr[m+n]='\0';

Encode space with abc
Trick is to start from end.

void encode(char *str)
{
//Count number of spaces, return if no space
int newlen=len + numspace*3;
str[newlen]='\0';
for(i=strlength-1;i>=0;i--)
   if(str[i]==' ' || str[i]=='\t') 
      { str[newlen--]='a'; str[newlen--]='b';  str[newlen]='c'; }
   else str[newlen]--]=str[i];


Match wildcard in string

int match_wildcard(char *str1, char *str2)
{
  if(*str1=='\0' && *str2=='\0\')   return 1; //end of both strings, done

  //ensure chars after * are present in str2.
  if(*str1=='*' && *(str1+1)!='\0' && *str2=='\0')   return 0;
  
  //Either 1st char of str1 is ?, or current char of both strings match
  if(*str1=='?' || *str1==*str2) return match_wildcard(str1+1, str2+1);

  //For *, 2 cases. consider current char of str2, OR ignore current char of str2
  if(*str1=='*') return match_wildcard(str1+1, str2) || match_wildcard(str1, str2+1);

  return 0;
}

Remove Repeating Letters Recursively/AdjacentPairs
ABCCBCBA -> ABBCBA -> ACBA
int i, j;
for (i=1; i<=len; i++)
   while(str[i]==str[j] && j>=0)  //To cancel pairs
   {  i++; j--; }
   str[++j]=str[i];

Sliding Window

Find a window that has all characters being looked. 
1. Make hash[256] having count of each character to be found
2. Make hashfound[256] having count of characters found till now
Then keep moving window as much right as possible. Update if current window is min so far.

2D array, find if pattern is present in array
Match first element, 
When matched, match second in its 8 neighbours
  To avoid visiting twice, keep visited array.
If match fails, mark unvisited while backtracking

Longest Palindrome
sO(n) time: Build Suffix Tree and reverse tree => T$revT#
Find deepest node marked with both $ and #.

Longest Common Subsequence
Idea:
Input: A[0..m-1]  B[0..n-1]
Compute: LCS length L[ A[0..m-1], B[0..n-1])
Last Char Match: If A[m-1]=B[n-1], then
  L[ A[0..m-1], B[0..n-1])=1+ L[ A[0..m-2], B[0..n-2])
Last Char Doesn't Match: If A[m-1]!=B[n-1], then 
  L[ A[0..m-1], B[0..n-1])=max( L[ A[0..m-2], B[0..n-1]  , L[ A[0..m-1], B[0..n-2])
E.g.
1) A=PQRP, B=SPRP. 
Last char match: L(PQRP, SPRP)=1+ L(PQR, SPR)
2) A=PQRSVW, B=PTSUWX
Last char does not match: L(PQRSVW, PTSUWX)=max(L(PQRSV, PTSUWX), L(PQRSVW, PTSUW)

Ai, Bj both can not be in LCS. 

Traverse 2D array L to print subsequence:
    If char in A, B corresponding to L[i][j] , i.e. A[i-1]==B[j-1] are same then include this char in LPS
    Else compute value of L[i-1][j] and L[i][j-1] and move in greater value direction.

void compute_printLCSarray(char *A, char *B, int m, int n)
{
    int L[m+1][n+1], i, j;
    for(i=0;i<=m;i++)
        for(j=0;j<=n;j++)
            if(i==0|j==0) L[i][j]=0
            else if(A[i-1]==B[j-1]) L[i][j]=L[i-1][j-1]+1;
            else L[i][j]=max(L[i-1][j], L[i][j-1]);

    //To print
    int index=L[m][n];  char lcstring[index+1];    lcstring[index]='\0';
    i=m; j=n; 
    while(i>0 && j>0)
    {
        if(A[i-1]==B[j-1]){ lcstring[index-1]=A[i-1]; i--; j--; index--; }
        else if(L[i-1][j] > L[i][j-1])  i--;
        else j--;
    }
    printf("LCString: %s", lcstring);
}

Find first non-repeating character from stream in O(1) time


Move even positioned elements to end, maintain order, do inplace
p1q2r3s4t5u6 -> pqrst123456

Think, do reverse of it with same constraints.

RobinKarp
If h(P) != h(T), definitely pattern does not match text
If h(P) =  h(T), check AreEqual(P, T)

RabinKarp(T, P):
  p=bigprime
  x=rand(1, p-1)
  result=emptylist
  pHash=PolyHash(P, p, x)
  for i from 0 to len(T)-len(P):
    tHash=PolyHash(T, p, x)
    if(pHash!=tHash): continue
    if AreEqual(P, T):  result.Appent(i)
  return result

Average and Best Case O(n+m)
Worst Case O(nm) when P=AA, T=AAAAAAA types.

PseudoCode:
void rabinkarp(char T[], char P[], int prime)
{
  int lpat=strlen(P), ltxt=strlen(T), pHash=tHash=0;
  int h=pow(MAX_CHAR, lpat-1)%q;
  for (i = 0; i < lpat; i++) //calculate pat, txt hash for this window
  { 
    pHash = (MAX_CHAR*pHash + pat[i])%prime
    tHash = (MAX_CHAR*tHash + txt[i])%prime;
  }
  for (i = 0; i <= ltxt - lpat; i++) //slide pattern
  {
    if ( pHash == tHash ) { /*match character by character*/}
    if(i < ltxt-lpat)  //calculate hash  next window value, remove lead/trail digit
    {
      tHash=(MAX_CHAR*(tHash - txt[i]*h) + txt[i+lpat])%
prime;
      if(tHash<0)  tHash = (tHash + prime);
    }
  }
}

Boyer Moore Bad Character heuristic:
The character of text that does not match with current character of pattern is called bad character. When mismatches, slide pattern such that aligns bad char with its last occurance in pattern.
It starts matching from last char of pat.
Preprocess pattern and store last occurance of every possible character. If the char is not present at all, then slides by len of pattern.

Best Case O(n/m)
Worst Case O(nm) when P=AA, T=AAAAAAA types.

#define NUM_CHAR 256

void badCharHeuristic(char *s, int size, int badchar[256])
{
  int i; 
  for(i=0; i<size;i++) badchar[i]=-1;
  for(i=0; i<size;i++) badchar[(int)s[i]] = i;
}

void boyermoore(char *txt, char *pat)
{
  int ltxt=strlen(txt), lpat=strlen(pat), badchar[256], shift=0;
  badCharHeuristic(pat, lpat, badchar);

  while(shift <= (ltxt-lpat)   //shift of pattern wrt txt
  {
    int j=lpat-1;
    while(j>=0 && pat[j]==txt[shift+j]) j--; //reduce j while pat, txt match at this shift
    if(j<0)
    {//shift pat s.t. next char in txt aligns its last occurrence in pat
      printf("Pattern occurs at shift %d\n", shift);
      shift += (shift+lpat < ltxt) ? lpat - badchar[ txt[shift+lpat] ] : 1;
    }
    else
    {//shift pat s.t. bad char in txt aligns its last occurrence in pat
        shift += max(1, j-badchar[ txt[shift+j] ] )
    }
  }
}
shift+lpat < ltxt: for case when pat occurs at end of txt
max() ensures + shift. shift -ve when bad char in pat is on right of current char
KMP:
WorstCase O(n)
Preprocess pattern and generate lps[lpat].
lps is longest proper prefix of pat[0..i] which is also suffix of pat[0..i]

E.g.
PAT=AAAAA, lps[]={0,1,2,3,4}
PAT=AAABAAA, lps={0,1,2,0,1,2,3}
PAT=AAACAAAAAAC, lps={0,1,2,0,1,2,3,3,3,4}
Value of lps is used to detect next slide.

void computeLPS(char *pat, int M, int *lps)
{
  int len=0, i=1;
  lps[0]=0; //always

  while(i<M)  //calculate lps[i] for i=1..M-1.
  {
    if(pat[i] == pat[len])
    { len++; lps[i]=len; i++;}
    else
    {
      if(len!=0) len=lps[len-1];  //i not incremented, think AAACAAAA, i=7 ??
      else  { lps[i]=0; i++;
    } 
  }
}

void KMP(char *pat, char *txt)
{
  int lpat=strlen(pat), ltxt=strlen(txt);
  int *lps=(int*)malloc(sizeof(int) * lpat), i=0, j=0;
  computeLPS(pat, lpat, lps);

while(i<ltxt)
{

if(pat[j]==txt[i]){ j++; i++; }

if(j==lpat){ printf("Fount at %d\n", i-j); j=lps[j-1];}

else if(i<ltxt && pat[j]!=txt[i])
{
if(j!=0) j=lps[j-1];
else i=i+1;
}

}

free(lps);
}

Reference

No comments:

Post a Comment