// Wednesday, August 5, 2020
//
// Personality_quiz_scraper

#include <stdio.h>
#include <string.h>

#include <cctype>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <windows.h>
#include <shellapi.h>
#include <dos.h>
#include <dirent.h>


using namespace std;
// Overwrites the first `l` characters of the buffer `s` with '\0' and
// returns `s`. Nothing is written when `l` is zero or negative.
char* initializestrings(char* s, int l)
{
    int pos = 0;
    while (pos < l)
    {
        s[pos] = '\0';
        ++pos;
    }
    return s;
}
// Returns `str` with its leading run of ' ' characters removed.
// Only the space character is stripped (tabs and other whitespace are kept),
// and trailing spaces are left untouched. A string made only of spaces
// becomes empty.
string trim_spaces(string str)
{
    // find_first_not_of yields npos for an all-space (or empty) string, and
    // erase(0, npos) then clears the whole string; a result of 0 erases nothing.
    str.erase(0, str.find_first_not_of(' '));
    return str;
}
// Reports whether the value `x` occurs among the first 15 elements of `a`.
// The array is expected to hold at least 15 entries.
bool checkextraques(int a[], int x)
{
    for (int idx = 0; idx < 15; ++idx)
    {
        if (a[idx] == x)
        {
            return true;
        }
    }
    return false;
}
// Extracts the host part of a URL.
//
// If the first '/' in `url` opens a "//" that is either at the very start of
// the string or directly follows a ':' (i.e. "scheme://" or "//host"), the
// host starts right after that "//"; otherwise the host is taken to start at
// the beginning of the string. The host ends at the next '/', '?' or '#',
// or at the end of the string when none of them is present.
//
// Returns an empty string for an empty URL. The scan never reads past the
// end of `url`, even when no '/' follows the host.
string find_domain(string url)
{
    string::size_type host_begin = 0;

    const string::size_type first_slash = url.find('/');
    if (first_slash != string::npos
        && first_slash + 1 < url.size()
        && url[first_slash + 1] == '/'
        && (first_slash == 0 || url[first_slash - 1] == ':'))
    {
        host_begin = first_slash + 2;
    }

    const string::size_type host_end = url.find_first_of("/?#", host_begin);
    if (host_end == string::npos)
    {
        return url.substr(host_begin);
    }
    return url.substr(host_begin, host_end - host_begin);
}

// Returns a copy of `x` in which every double quote (") has been replaced
// by a single quote ('). All other characters are left unchanged.
string remove_double_quotes(string x)
{
    for (string::size_type pos = 0; pos < x.size(); ++pos)
    {
        if (x[pos] == '"')
        {
            x[pos] = '\'';
        }
    }
    return x;
}
bool is_file_exist(const char *fileName)
{
struct dirent *entry;
string contents[400];
int i=0;
    DIR *dir = opendir("C:/Users/USER/Desktop");
    while ((entry = readdir(dir)) != NULL)
{
    if(strcmpi(entry->d_name,fileName)==0)
    {
closedir(dir);
    return true;
    }
    }
    return false;
}
int main()
{
int dwiterator = 0;
int siteoption =0;
char ch;
string tempqa[1000], res[200], resv[500];
char thumbnailurl[400];
int qi, ri; qi = 0; ri = 0;
ifstream html[5];
ofstream pythonscript[100];
do
{

string url,domain,ques,ans;
int quesloop, ansloop; quesloop =0; ansloop =0;

  cout<<"Enter your URL\n";
  cin>>url;
  domain = find_domain(url);
  string alloweddomains[] = {"www.playbuzz.com", "play.howstuffworks.com", "www.zoo.com",
  "brainfall.com", "www.gotoquiz.com", "www.proprofs.com", "quizdoo.com", "altgalaxy.co", 
  "status4everyone.com", "www.magiquiz.com", "www.allthetests.com", "www.women.com", "www.beano.com","quizlady.com", "www.buildquizzes.com",
  "www.thequiz.com"};
 
  for(int i=0;i<16;i++)
  {
  if(domain==alloweddomains[i])
  {
  siteoption = i+1;
  break;
}
 
}
 
if(siteoption==0)
{
cout<<"Domain: " <<domain<<" This domain is invalid\n";
exit(0);
}
if(siteoption == 1)
{
ques = "pb-quiz-text-card embed-responsive-item\"";
ans = "answer-wrapper\"";
quesloop = 0;
ansloop = 0;
}
if(siteoption == 2 || siteoption == 3)
{
ques = "title-tertiary\"";
ans = "answer answer-default\"";
quesloop = 1;
ansloop = 1; 
}
if(siteoption == 4)
{
ques = "question_title\"";
ans = "answer noselect\"";
quesloop = 1;
ansloop = 1;
}
if(siteoption == 5)
{
ques = "<strong>";
ans = "type=\"radio\"";
quesloop = 1;
ansloop = 1;
}
if(siteoption == 6)
{
ques = "question-text\"";
ans = "opt_text\"";
quesloop = 1;
ansloop = 2;
}
if(siteoption == 7 || siteoption == 14)
{
ques="qc-question-text\"";
ans="qc-answer-text\"";
quesloop = 1;
ansloop = 1;
}
if(siteoption == 8)
{
ques="wpvqgr-question-label\"";
ans="wpvqgr-answer-label\"";
quesloop=1;
ansloop=1;
}
if(siteoption == 9)
{
ques="snax-quiz-question-title\"";
ans="snax-quiz-answer-label-text\"";
quesloop=1;
ansloop=1;
}
if(siteoption == 10)
{
ques="question-title overlay\"";
ans="fa fa-check\"";
quesloop = 1;
ansloop = 3;
}
if(siteoption == 11)
{
ques="class=\"questions";
ans="class=\"answer\">";
quesloop = 8;
ansloop = 2;
}
if(siteoption == 12)
{
ques = "\"caption_background\":";
ans = "\"title\":";
quesloop = 0;
ansloop = 0;
}
if(siteoption == 13)
{
ques = "\"QuizQuestionText-text-";
ans = "\"QuizAnswer-text-";
quesloop = 2;
ansloop = 1;
}
if(siteoption == 15)
{
ques = "<div class=\"qp_qi";
ans = "<input class=\"qp_i";
quesloop = 1;
ansloop = 1;
}
if(siteoption == 16)
{
ques="question-title\"";
ans="js-disabled";
quesloop = 1;
ansloop = 1;
}
pythonscript[dwiterator].open("pyparser.py");
string script = "from selenium import webdriver\nfrom selenium.webdriver.chrome.options import Options\noptions=Options()\nurl = ''\ndomain = ''\noptions.add_experimental_option( 'prefs',{'profile.managed_default_content_settings.javascript': 2})\ndriver = webdriver.Chrome('C:/Users/USER/Desktop/chromedriver.exe', options=options)\ndriver.get(url)\nraw_html=driver.page_source\ndriver.quit()\nfrom bs4 import BeautifulSoup as bs\nsoup = bs(raw_html,features='html.parser')\nif domain == 'www.women.com':\n    script_content = soup.find(id='wdc_quiz_data_json')\n    str_script_content = str(script_content)\n    str_script_content = str_script_content[49:len(str_script_content) - 9]\n    import json\n    str_script_content = (json.dumps(json.loads(str_script_content), indent=2))\n    with open('C:/Users/USER/Desktop/to_reverse.txt', 'w', encoding='utf-8') as f:\n        f.write(str_script_content)\n    with open('C:/Users/USER/Desktop/to_reverse.txt') as f, open('C:/Users/USER/Desktop/htmlparsing.txt', 'w') as fout:\n        fout.writelines(reversed(f.readlines()))\nelse:\n    prettyHTML = soup.prettify()\n    with open('C:/Users/USER/Desktop/htmlparsing.txt', 'w', encoding='utf-8') as f:\n        f.write(prettyHTML.replace('&amp;', '&'))";

int findurl=script.find("url = ");
int finddomain=script.find("domain = ");
script.insert(finddomain+10, domain);
script.insert(findurl+7, url);
pythonscript[dwiterator]<<script;
pythonscript[dwiterator].close();
ShellExecuteA(NULL, "open", "pyparser.py", NULL, NULL, SW_SHOWNORMAL);
while(!is_file_exist("htmlparsing.txt"))
{
Sleep(2000);
}
Sleep(3000);
html[dwiterator].open("htmlparsing.txt");
string line;
while(!html[dwiterator].eof())
{
  line = "";
  getline(html[dwiterator], line);
  int y=0;
  string x; x="";
  int findques = 0;
  findques = line.find(ques);
  if(findques>0)
  {   
  if(siteoption == 1)
  {
  for(int i=1;i<=7;i++)
  {
  line="";
  getline(html[dwiterator], line);
  int pfindques = line.find("<p>");
  if(pfindques>=0)
  {
  line="";
getline(html[dwiterator], line);
break;
}
}
}
if(siteoption == 12)
{
line = "";
      getline(html[dwiterator], line);
  int pfind; string temp; temp="";
  pfind = line.find("\": ");
  for(int i=pfind+4; line[i]!='"';i++)
  temp = temp+line[i];
  line = "";
  line=temp; 
}
else 
{
      for(int i=1;i<=quesloop;i++)
      {
line = "";
      getline(html[dwiterator], line);
  }
  }
  x=trim_spaces(line);
  cout<<"question "<<x<<"\n";
    x=remove_double_quotes(x);
    x.insert(0, "Q\"");
    x.append("\"");
    tempqa[qi] = x;
    qi++;
  y=1;
  }

  if(y==0)
  {
  int findans = 0;
  findans = line.find(ans);
   if(findans>0)
   {
    if(siteoption == 1)
    {
    for(int i=1;i<=20;i++)
  {
  line="";
  getline(html[dwiterator], line);
  int pfindans = line.find("<p>");
  if(pfindans>=0)
  {
  line="";
getline(html[dwiterator], line);
break;
}
}
}
if(siteoption == 12)
{
string temp; temp="";
getline(html[dwiterator], temp);
int find_res_wt;
find_res_wt = temp.find("\"result_weight\":");
if(find_res_wt>0)
{
int pfind;
temp="";
pfind = line.find("\": ");
for(int i=pfind+4; line[i]!='"';i++)
  temp = temp+line[i];
  line = "";
  line=temp;
}
else
continue;
}
      else
{
  for(int i=1;i<=ansloop;i++)
    {
    line = "";
      getline(html[dwiterator], line);
      }
  }
    x=trim_spaces(line);
    x=remove_double_quotes(x);
    x.insert(0, "A\"");
  tempqa[qi] = x;
  qi++;
    y=1;
  }
  
  }
 
}
html[dwiterator].close();
Sleep(3000);
remove("pyparser.py");

remove("htmlparsing.txt");

++dwiterator;
if(dwiterator<5)
{
cout<<"Do you want to continue? (Press 'Y' to continue and 'N' to exit)\n";
cin>>ch;
}
}
while(toupper(ch)!='N' && dwiterator<5);




ofstream allquestions;
allquestions.open("allquestions.txt");
for(int i=0;i<qi;i++)
{
char ch = tempqa[i].c_str()[0];
if(ch=='Q')
allquestions<<"Index of Question "<<i<<" "<<tempqa[i]<<"\n";
else
allquestions<<tempqa[i]<<"\n";
}
allquestions.close();
/*int allquestionsindex[15];
    cout<<"Enter the index of all questions which you want to have:\n";
    for(int i=0;i<15;i++)
    cin>>allquestionsindex[i];*/




string mac[1000]; int y=0; int k=0;
for(int i=0;i<qi-1;i++)
{
char ch = tempqa[i].c_str()[0];
if(ch == 'Q' && k<15 && tempqa[i+1].c_str()[0]=='A')
{
mac[y] = tempqa[i];
y++;
for(int j=i+1;tempqa[j].c_str()[0]=='A';j++)
{
mac[y]=tempqa[j];
y++;
}
k++;
}
}
int numofprsn; 
char pd;
string dummy="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec fringilla placerat turpis eget imperdiet. Cras a dignissim tortor. Nunc et gravida nisi, et dignissim ex. Cras quis sem felis. Aenean ultricies finibus magna ut interdum. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Aenean et.";
dummy.insert(0,"\"");
dummy.append("\"");
cout<<"How many personalities are there?\n";
cin>>numofprsn;
cout<<"Do you want to enter the personality Description? (Enter 'Y' for Yes and 'N' for No)\n";
cin>>pd;
for(int i=1;i<=numofprsn;i++)
{
cout<<"Enter the personality\n";
fflush(stdin);
getline(cin, res[ri]);
res[ri]=remove_double_quotes(res[ri]);
res[ri].insert(0,"\"");
res[ri].append("\"");
ri++;
if(toupper(pd) == 'Y')
{
cout<<"Enter the personality description\n";
fflush(stdin);
getline(cin, res[ri]);
res[ri]=remove_double_quotes(res[ri]);
res[ri].insert(0,"\"");
res[ri].append("\"");
    }
    else
    {
    fflush(stdin);
    res[ri]=dummy;
}
ri++;
}

int pi=0;
for(int i=0;i<y;i++)
{
    const char* th=mac[i].c_str();
    if(th[0]=='A')
    {
        string x,y,z;
        x=th;
        z=res[pi];
        y=z.replace(0,1,"#");
        x.append(y);
        cout<<x<<"\n";
        mac[i]=x;
        pi=pi+2;
        if(pi==ri)
        pi=0;
    }
}


ofstream csv;
string qcsv = "personality_quiz.csv";
csv.open(qcsv);
string quiz_title;
cout<<"Enter the title of this quiz\n";
getline(cin, quiz_title);
quiz_title=remove_double_quotes(quiz_title);
quiz_title.insert(0,"\"");
quiz_title.append("\"");
string first_row[25] = {"quiz_title", "Type", "personality", "personality_desc", "question", "answer_1", "answer_2", "answer_3", "answer_4", "answer_5", "answer_6",
"answer_7", "answer_8", "answer_9", "answer_10", "answer_11", "answer_12", "answer_13", "answer_14", "answer_15", "answer_16", "answer_17",
    "answer_18", "answer_19", "answer_20"};

int total_rows;
if((k-1)>=(ri/2))
total_rows = k;
else
total_rows = (ri/2);
int rt, qt; rt=2; qt=0;
for(int t=0;t<25;t++)
csv<<first_row[t]<<",";
csv<<"\n";
for(int i=1;i<=total_rows;i++)
{

if(i==1)
{
csv<<quiz_title<<","<<"personality"<<","<<res[0]<<","<<res[1]<<",";
}
else
{
if(rt<=ri-2)
{
csv<<""<<","<<""<<","<<res[rt]<<","<<res[rt+1]<<",";
rt+=2;
}
else
{
for(int t=1;t<=4;t++)
csv<<""<<",";
}
}
csv<<mac[qt].substr(1,mac[qt].size()-1)<<",";
int j;
for(j=qt+1;mac[j].c_str()[0]=='A';j++)
{
csv<<mac[j].substr(1,mac[j].size()-1)<<",";
}
qt=j;
csv<<"\n";
    

}


csv.close();


return 0;
}

// No comments:
//
// Post a Comment