gusucode.com > 采集网页选定部分全攻略C++源码程序 > 采集网页选定部分全攻略/collewpgtst2/test2/testa.cpp
// testa.cpp : Implementation of Ctesta #include "stdafx.h" #include "Test2.h" #include "testa.h" #include "Registry.h" #include <comutil.h> #include <direct.h> #include <string> #include <urlmon.h> using namespace std; const char debugpath[] = "d:\\info.txt"; ///////////////////////////////////////////////////////////////////////////// // Ctesta /***************************************************** 功能:将字符串保存为给定文件名 函数名:SaveCharToFile( const LPCTSTR data,//[IN] 给定待保存的数据 const LPCTSTR saveFileName,//[IN]给定文件名 BOOL flag = FALSE)//[IN]追加还是覆盖标志 *****************************************************/ inline BOOL SaveCharToFile(const LPCTSTR data,const LPCTSTR saveFileName,BOOL flag = FALSE) { DWORD numtowrite = 0; HANDLE hfile = CreateFile(saveFileName, GENERIC_WRITE, FILE_SHARE_WRITE, NULL, flag?CREATE_ALWAYS:OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); if(INVALID_HANDLE_VALUE == hfile){ return FALSE; } if(0xFFFFFFFF == SetFilePointer(hfile,0,0,FILE_END)){ return FALSE; } if(NULL == WriteFile(hfile,data,strlen(data),&numtowrite,NULL)) return FALSE; if(NULL == CloseHandle(hfile)) return FALSE; return TRUE; } /***************************************************** 功能:在给定字符串中查找给定字符串 函数名:const char *FindString( const LPCTSTR source,//[IN] 给定源字符串的数据 const LPCTSTR key)//[IN]待查字符串 *****************************************************/ inline const char *FindString(const LPCTSTR source,const LPCTSTR key) { if(strlen(source)<=0||strlen(key)<=0) return NULL; char* p1 = NULL; char* p2 = NULL; //copy source to a new pointer and lowercase char * Pointer = _strlwr(_strdup(source)); if(Pointer == NULL) return NULL; p1 = strstr(Pointer,key); if(p1 == NULL) return NULL; int n = p1-Pointer; free(Pointer); //return the true pointer offset return source+n; } /***************************************************** 功能:创建给定路径的所有未建目录 函数名:void CreateAllDirectory( const char* AllPath)//[IN] 需要创建的详细目录 例如 输入为:"d:\a\b\c\d"则在D盘下创建相应a,b,c,d相应目录 *****************************************************/ inline void CreateAllDirectory(const char* AllPath) { char _path[MAX_PATH]; strcpy(_path,AllPath); char tmp[MAX_PATH]; unsigned len = strlen( _path ); for ( unsigned i = 0; i < len - 1; i ++ ) { if ( _path[i] == '\\' ) { strcpy( tmp, _path ); tmp[i] = 0; _mkdir( tmp ); } } } /***************************************************** 功能:创建给定路径的所有未建目录 函数名:BOOL CheckData( //下载网页中所有图像文件 const LPCTSTR data,//网页脚本 const LPCTSTR host,//网站名 const LPCTSTR path,//存盘路径 DWORD Number,//存盘序号 std::string &outstring)//用于记录转换后的网页 返回 成功:返回TRUE,没有找到图像链接或者出现异常返回FALSE *****************************************************/ BOOL CheckData(const LPCTSTR data, const LPCTSTR host, const LPCTSTR path, DWORD Number, std::string &outstring) { char chImgPath[1024];//图像路径 !:由于有些图像路径可能会很长所以申请内存多一点 char chImgSrc[MAX_PATH];//图像地址 char chDownLoadPath[MAX_PATH];//下载图像文件路径 char chWriteImgSrc[MAX_PATH];//图像文件路径 memset(chImgPath,0,1024); memset(chImgSrc,0,MAX_PATH); memset(chDownLoadPath,0,MAX_PATH); memset(chWriteImgSrc,0,MAX_PATH); char dirname[20];//根目录名 ltoa(Number,dirname,10); try{ //查找是否有<img const char *p1 = FindString(data,"<img"); const char *p2 = FindString(p1,"src=\""); const char *p3 = FindString(p2+5,"\""); const char *p4 = FindString(p1,">"); if(p1 == NULL||p2 == NULL||p3 == NULL||p4 == NULL) return FALSE; //找到一个图像文件标记 //拷贝图像链接之前的文字 int n = p4-p1+1; strncpy(chImgPath,p1,n); outstring.append(data,p2-data+5); //提取图像链接 n = p3-p2-5; strncpy(chImgSrc,p2+5,n); if(FindString(chImgSrc,"http://") == NULL){ if(FindString(chImgSrc,"..")) strcpy(chImgSrc,&chImgSrc[2]); sprintf(chDownLoadPath,"%s%s",host,chImgSrc); sprintf(chWriteImgSrc,"%s//%s%s",path,dirname,chImgSrc); }else{ strcpy(chDownLoadPath,chImgSrc); const char *p5 = FindString(chImgSrc+7,"/"); sprintf(chWriteImgSrc,"%s\\%s%s",path,dirname,&chImgSrc[p5-chImgSrc]); } char Output[MAX_PATH]; sprintf(Output,"图像地址:%s\r\n存盘地址:%s\r\n主机地址:%s\r\n",chImgSrc,chWriteImgSrc,host); SaveCharToFile(Output,debugpath); n = strlen(chWriteImgSrc); for(int i=0;i<n;i++){ if(chWriteImgSrc[i] == '/') chWriteImgSrc[i] = '\\'; } //在下载之前先建立保存图像文件的路径 CreateAllDirectory(chWriteImgSrc); //下载图像文件 HRESULT hr = URLDownloadToFile( NULL, chDownLoadPath, chWriteImgSrc, 0, NULL); if( SUCCEEDED(hr))// { const char *p6 = FindString(chDownLoadPath+7,"/"); sprintf(chWriteImgSrc,"%s%s",dirname,&chDownLoadPath[p6-chDownLoadPath]); //将存盘的路径保存进字符串 }else{ strcpy(chWriteImgSrc,chDownLoadPath); //没有下载成功将原始路径保存进 } outstring.append(chWriteImgSrc); outstring.append(p3,p4-p3+1); BOOL ret = CheckData(p4+1,host,path,Number,outstring); if(!ret){ outstring.append(p4+1); } }catch(...){ return FALSE; } return TRUE; } //回调函数:找到IE类名的窗口指针 BOOL CALLBACK EnumChildProc(HWND hwnd,LPARAM lParam) { TCHAR buf[100]; ::GetClassName( hwnd, (LPTSTR)&buf, 100 ); if ( _tcscmp( buf, _T("Internet Explorer_Server") ) == 0 ) { *(HWND*)lParam = hwnd; return FALSE; } else return TRUE; }; //功能:保存当前网页中选定内容为文件 STDMETHODIMP Ctesta::GetHtmlText() { //保存网页内容的目录 char chFilePath[MAX_PATH]; DWORD Number = 0; CRegistry reg; reg.Open(HKEY_LOCAL_MACHINE,"SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\程序员加油站\\"); BOOL ret = reg.ReadDWORD("ArticleNumber",&Number); if(!ret) return S_FALSE; //读取保存网页文件的目录 ret = reg.ReadString("ArticlePath",chFilePath); if(!ret) return S_FALSE; //取得当前活动窗口的窗口句柄 HWND hWnd = GetActiveWindow(); CoInitialize( NULL ); //显式装载 MSAA 判断是否被安装 HINSTANCE hInst = ::LoadLibrary( _T("OLEACC.DLL") ); if ( hInst != NULL ) { if ( hWnd != NULL ) { HWND hWndChild=NULL; // 取得当前窗口的IE子窗口指针 ::EnumChildWindows( hWnd, EnumChildProc, (LPARAM)&hWndChild ); if ( hWndChild ) { //定义IE文档 CComPtr<IHTMLDocument2> pHTMLDoc; LRESULT lRes; //由于WM_HTML_GETOBJECT非Windows标准消息,所以需要RegisterWindowMessage UINT nMsg = ::RegisterWindowMessage( _T("WM_HTML_GETOBJECT") ); ::SendMessageTimeout( hWndChild, nMsg, 0L, 0L, SMTO_ABORTIFHUNG, 1000, (DWORD*)&lRes ); LPFNOBJECTFROMLRESULT pfObjectFromLresult = (LPFNOBJECTFROMLRESULT)::GetProcAddress( hInst, _T("ObjectFromLresult") ); if ( pfObjectFromLresult != NULL ) { HRESULT hr; //获取网页的IHTMLDocument2接口 hr = (*pfObjectFromLresult)( lRes, IID_IHTMLDocument, 0, (void**)&pHTMLDoc ); if ( SUCCEEDED(hr) ) { CComPtr<IHTMLSelectionObject> pSelObj; CComPtr<IHTMLTxtRange> pTxtRange; //根据IHTMLDocument2指针取得IHTMLSelectionObject接口指针 pHTMLDoc->get_selection(&pSelObj); //再获得IHTMLTxtRange指针 hr = pSelObj->createRange((IDispatch**)&pTxtRange); if(!CheckResult(hr,"pTxtRange")) return hr; //选择所有被选择的内容 pTxtRange->select(); BSTR bstrTxt,bstrTxt1; char strPath[MAX_PATH]; char *strTxt = NULL; char*strTxt1 = NULL; //取得主站域名 CComPtr<IHTMLLocation> pLocation; pHTMLDoc->get_location(&pLocation); pLocation->get_hostname(&bstrTxt1); strTxt1 = _com_util::ConvertBSTRToString(bstrTxt1); sprintf(strPath,"http://%s",strTxt1); SaveCharToFile(strTxt1,debugpath); //取得选中的内容 pTxtRange->get_htmlText(&bstrTxt); strTxt = _com_util::ConvertBSTRToString(bstrTxt); //下载内容中的图片资源,并修改相应链接 std::string webpage; char chSavePath[MAX_PATH]; sprintf(chSavePath,"%s\\T%ld.htm",chFilePath,Number); CreateAllDirectory(chSavePath); //取得所有图片资源并保存网页 if(CheckData(strTxt,strPath,chFilePath,Number,webpage) == FALSE) SaveCharToFile(strTxt,chSavePath,TRUE); else SaveCharToFile(webpage.c_str(),chSavePath,TRUE); BOOL ret = reg.WriteDWORD("ArticleNumber",++Number); //释放内存 if(strTxt1) delete[] strTxt1; if(strTxt) delete[] strTxt; SysFreeString(bstrTxt1); // 用完释放 SysFreeString(bstrTxt); // 用完释放 } } } } // else Internet Explorer is not running ::FreeLibrary( hInst ); } // else Active Accessibility is not installed CoUninitialize(); return S_OK; } //返回值调试输出函数 BOOL Ctesta::CheckResult(HRESULT hrs, LPCTSTR content) { if (!SUCCEEDED(hrs)){ char Info[MAX_PATH]; sprintf(Info,"%s Error = %ld\r\n",content,hrs); SaveCharToFile(Info,debugpath); return FALSE; } return TRUE; }