// www.gusucode.com > 采集网页选定部分全攻略C++源码程序 > 采集网页选定部分全攻略/collewpgtst2/test2/testa.cpp

    // testa.cpp : Implementation of Ctesta
#include "stdafx.h"
#include "Test2.h"
#include "testa.h"
#include "Registry.h"
#include <comutil.h>
#include <ctype.h>
#include <direct.h>
#include <string.h>
#include <string>
#include <urlmon.h>

using namespace std;
// Debug log file: the SaveCharToFile(..., debugpath) calls in this file
// append their diagnostic text to it.
const char debugpath[] = "d:\\info.txt";
/////////////////////////////////////////////////////////////////////////////
// Ctesta
/*****************************************************
Purpose: write a NUL-terminated string to a file.
Function: SaveCharToFile(
const LPCTSTR data,          // [IN] text to write (narrow string; strlen is used)
const LPCTSTR saveFileName,  // [IN] path of the target file
BOOL flag = FALSE)           // [IN] TRUE : create/truncate the file, then write
                             //      FALSE: open or create the file and append
Returns: TRUE when every byte was written and the handle was closed,
         FALSE otherwise (including NULL arguments).
*****************************************************/
inline BOOL SaveCharToFile(const LPCTSTR data,const LPCTSTR saveFileName,BOOL flag = FALSE)
{
	if(data == NULL || saveFileName == NULL)
		return FALSE;

	HANDLE hfile = CreateFile(saveFileName,
		GENERIC_WRITE,
		FILE_SHARE_WRITE,
		NULL,
		flag?CREATE_ALWAYS:OPEN_ALWAYS,
		FILE_ATTRIBUTE_NORMAL,
		NULL);
	if(INVALID_HANDLE_VALUE == hfile)
		return FALSE;

	BOOL ok = TRUE;

	// Always position at the end: a no-op for a freshly truncated file and
	// the append position for an existing one. With a NULL high-part pointer
	// a return value of 0xFFFFFFFF means the call failed.
	if(0xFFFFFFFF == SetFilePointer(hfile,0,NULL,FILE_END))
		ok = FALSE;

	if(ok){
		const DWORD numtowrite = static_cast<DWORD>(strlen(data));
		DWORD numwritten = 0;
		// Treat a short write as a failure as well.
		if(!WriteFile(hfile,data,numtowrite,&numwritten,NULL) || numwritten != numtowrite)
			ok = FALSE;
	}

	// The handle is closed on every path so a failed seek/write no longer leaks it.
	if(!CloseHandle(hfile))
		ok = FALSE;

	return ok;
}
/*****************************************************
Purpose: find the first occurrence of key inside source, ignoring the
         letter case of source.
Function: const char *FindString(
const char *source,  // [IN] text to search (may be NULL)
const char *key)     // [IN] text to look for; must already be lower-case,
                     //      because only source is case-folded (may be NULL)
Returns: pointer into source at the start of the match, or NULL when there
         is no match or when either argument is NULL or empty.
Note: the parameters are spelled const char* because the function is
      char-based (strlen); this is the same type as LPCTSTR in an ANSI build.
*****************************************************/
inline const char *FindString(const char *source,const char *key)
{
	if(source == NULL || key == NULL || *source == '\0' || *key == '\0')
		return NULL;

	const size_t keyLen = strlen(key);

	// Compare in place instead of searching a lower-cased heap copy of
	// source: no allocation, so nothing can leak on the "not found" path.
	for(const char *start = source; *start != '\0'; ++start){
		size_t matched = 0;
		while(matched < keyLen
			&& start[matched] != '\0'
			&& static_cast<char>(tolower(static_cast<unsigned char>(start[matched]))) == key[matched])
			++matched;
		if(matched == keyLen)
			return start;
	}
	return NULL;
}
/*****************************************************
功能:创建给定路径的所有未建目录
函数名:void CreateAllDirectory(
const char* AllPath)//[IN] 需要创建的详细目录
例如 输入为:"d:\a\b\c\d"则在D盘下创建相应a,b,c,d相应目录
*****************************************************/
inline void CreateAllDirectory(const char* AllPath)
{
	char _path[MAX_PATH];
	strcpy(_path,AllPath);
	char tmp[MAX_PATH];
	unsigned len = strlen( _path );
	for ( unsigned i = 0; i < len - 1; i ++ ) {
		if ( _path[i] == '\\' ) {
			strcpy( tmp, _path );
			tmp[i] = 0;
			_mkdir( tmp );
		}
	}
}
/*****************************************************
功能:创建给定路径的所有未建目录
函数名:BOOL CheckData( //下载网页中所有图像文件
			   const LPCTSTR data,//网页脚本
			   const LPCTSTR host,//网站名
			   const LPCTSTR path,//存盘路径
			   DWORD Number,//存盘序号
			   std::string &outstring)//用于记录转换后的网页
返回 成功:返回TRUE,没有找到图像链接或者出现异常返回FALSE
*****************************************************/
BOOL CheckData(const LPCTSTR data,
			   const LPCTSTR host,
			   const LPCTSTR path,
			   DWORD Number,
			   std::string &outstring)
{
	char chImgPath[1024];//图像路径 !:由于有些图像路径可能会很长所以申请内存多一点
	char chImgSrc[MAX_PATH];//图像地址
	char chDownLoadPath[MAX_PATH];//下载图像文件路径
	char chWriteImgSrc[MAX_PATH];//图像文件路径

	memset(chImgPath,0,1024);
	memset(chImgSrc,0,MAX_PATH);
	memset(chDownLoadPath,0,MAX_PATH);
	memset(chWriteImgSrc,0,MAX_PATH);

	char dirname[20];//根目录名
	ltoa(Number,dirname,10);

	try{
		//查找是否有<img
		const char *p1 = FindString(data,"<img");
		const char *p2 = FindString(p1,"src=\"");
		const char *p3 = FindString(p2+5,"\"");
		const char *p4 = FindString(p1,">");
		if(p1 == NULL||p2 == NULL||p3 == NULL||p4 == NULL)
			return FALSE;

		//找到一个图像文件标记

		//拷贝图像链接之前的文字
		int n = p4-p1+1;

		strncpy(chImgPath,p1,n);
		outstring.append(data,p2-data+5);

		//提取图像链接

		n = p3-p2-5;
		strncpy(chImgSrc,p2+5,n);
		if(FindString(chImgSrc,"http://") == NULL){
			if(FindString(chImgSrc,".."))
				strcpy(chImgSrc,&chImgSrc[2]);
			sprintf(chDownLoadPath,"%s%s",host,chImgSrc);
			sprintf(chWriteImgSrc,"%s//%s%s",path,dirname,chImgSrc);
		}else{
			strcpy(chDownLoadPath,chImgSrc);
			const char *p5 = FindString(chImgSrc+7,"/");
			sprintf(chWriteImgSrc,"%s\\%s%s",path,dirname,&chImgSrc[p5-chImgSrc]);
		}
		char Output[MAX_PATH];
		sprintf(Output,"图像地址:%s\r\n存盘地址:%s\r\n主机地址:%s\r\n",chImgSrc,chWriteImgSrc,host);
		SaveCharToFile(Output,debugpath);
		n = strlen(chWriteImgSrc);
		for(int i=0;i<n;i++){
			if(chWriteImgSrc[i] == '/')
				chWriteImgSrc[i] = '\\';
		}

		//在下载之前先建立保存图像文件的路径
		CreateAllDirectory(chWriteImgSrc);

		//下载图像文件
		HRESULT hr = URLDownloadToFile( NULL, chDownLoadPath, chWriteImgSrc, 0, NULL);
		if( SUCCEEDED(hr))//
		{
			const char *p6 = FindString(chDownLoadPath+7,"/");
			sprintf(chWriteImgSrc,"%s%s",dirname,&chDownLoadPath[p6-chDownLoadPath]);
			//将存盘的路径保存进字符串
		}else{
			strcpy(chWriteImgSrc,chDownLoadPath);
			//没有下载成功将原始路径保存进
		}
		outstring.append(chWriteImgSrc);
		outstring.append(p3,p4-p3+1);
		BOOL  ret = CheckData(p4+1,host,path,Number,outstring);
		if(!ret){
			outstring.append(p4+1);
		}
	}catch(...){
		return FALSE;
	}
	return TRUE;
}
// EnumChildWindows callback: looks for the child window whose class name is
// "Internet Explorer_Server". lParam must point to an HWND; it receives the
// handle of the first match. Returns FALSE to stop the enumeration once a
// match is stored, TRUE to continue.
BOOL CALLBACK EnumChildProc(HWND hwnd,LPARAM lParam)
{
	TCHAR	className[100];

	// GetClassName returns 0 on failure and then does not guarantee the
	// buffer contents, so only compare when a name was actually returned.
	if ( ::GetClassName( hwnd, className, sizeof(className)/sizeof(className[0]) ) == 0 )
		return TRUE;

	if ( lParam != 0 && _tcscmp( className, _T("Internet Explorer_Server") ) == 0 )
	{
		*reinterpret_cast<HWND*>(lParam) = hwnd;
		return FALSE;
	}
	return TRUE;
}
//功能:保存当前网页中选定内容为文件
STDMETHODIMP Ctesta::GetHtmlText()
{
	//保存网页内容的目录
	char chFilePath[MAX_PATH];
	DWORD Number = 0;

	CRegistry reg;
	reg.Open(HKEY_LOCAL_MACHINE,"SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\程序员加油站\\");
	BOOL ret = reg.ReadDWORD("ArticleNumber",&Number); 
	if(!ret)
		return S_FALSE;

	//读取保存网页文件的目录
	ret = reg.ReadString("ArticlePath",chFilePath);
	if(!ret)
		return S_FALSE;

	//取得当前活动窗口的窗口句柄
	HWND hWnd = GetActiveWindow();


	CoInitialize( NULL );
	
	//显式装载 MSAA 判断是否被安装
	HINSTANCE hInst = ::LoadLibrary( _T("OLEACC.DLL") );
	if ( hInst != NULL )
	{
		if ( hWnd != NULL )
		{
			HWND hWndChild=NULL;
			// 取得当前窗口的IE子窗口指针
			::EnumChildWindows( hWnd, EnumChildProc, (LPARAM)&hWndChild );
			if ( hWndChild )
			{
				//定义IE文档
				CComPtr<IHTMLDocument2> pHTMLDoc;
				LRESULT lRes;
				
				//由于WM_HTML_GETOBJECT非Windows标准消息,所以需要RegisterWindowMessage
				UINT nMsg = ::RegisterWindowMessage( _T("WM_HTML_GETOBJECT") );
				::SendMessageTimeout( hWndChild, nMsg, 0L, 0L, SMTO_ABORTIFHUNG, 1000, (DWORD*)&lRes );
				
				LPFNOBJECTFROMLRESULT pfObjectFromLresult = (LPFNOBJECTFROMLRESULT)::GetProcAddress( hInst, _T("ObjectFromLresult") );
				if ( pfObjectFromLresult != NULL )
				{
					HRESULT hr;
					//获取网页的IHTMLDocument2接口
					hr = (*pfObjectFromLresult)( lRes, IID_IHTMLDocument, 0, (void**)&pHTMLDoc );
					if ( SUCCEEDED(hr) )
					{
						CComPtr<IHTMLSelectionObject> pSelObj;
						CComPtr<IHTMLTxtRange> pTxtRange;

						//根据IHTMLDocument2指针取得IHTMLSelectionObject接口指针
						pHTMLDoc->get_selection(&pSelObj);
						//再获得IHTMLTxtRange指针
						hr = pSelObj->createRange((IDispatch**)&pTxtRange);
						if(!CheckResult(hr,"pTxtRange"))
							return hr;
						//选择所有被选择的内容
						pTxtRange->select();

						BSTR bstrTxt,bstrTxt1;
						char strPath[MAX_PATH];
						char *strTxt = NULL;
						char*strTxt1 = NULL;

						//取得主站域名
						CComPtr<IHTMLLocation> pLocation;
						pHTMLDoc->get_location(&pLocation);
						pLocation->get_hostname(&bstrTxt1);
						strTxt1 = _com_util::ConvertBSTRToString(bstrTxt1);

						sprintf(strPath,"http://%s",strTxt1);
						SaveCharToFile(strTxt1,debugpath);

						//取得选中的内容
						pTxtRange->get_htmlText(&bstrTxt);
						strTxt = _com_util::ConvertBSTRToString(bstrTxt);

						//下载内容中的图片资源,并修改相应链接
						std::string webpage;
						char chSavePath[MAX_PATH];
						sprintf(chSavePath,"%s\\T%ld.htm",chFilePath,Number);
						CreateAllDirectory(chSavePath);
						
						//取得所有图片资源并保存网页
						if(CheckData(strTxt,strPath,chFilePath,Number,webpage) == FALSE)
							SaveCharToFile(strTxt,chSavePath,TRUE);
						else
							SaveCharToFile(webpage.c_str(),chSavePath,TRUE);
						BOOL ret = reg.WriteDWORD("ArticleNumber",++Number);
						//释放内存
						if(strTxt1)
							delete[] strTxt1;
						if(strTxt)
							delete[] strTxt;
						SysFreeString(bstrTxt1); // 用完释放
						SysFreeString(bstrTxt); // 用完释放
					}
				}
			}
		} // else Internet Explorer is not running
		::FreeLibrary( hInst );
	} // else Active Accessibility is not installed
	CoUninitialize();

	return S_OK;
}
// Debug helper for HRESULT checks.
// Returns TRUE when hrs is a success code. Otherwise appends
// "<content> Error = <hrs as decimal>\r\n" to the debug log file and
// returns FALSE.
BOOL Ctesta::CheckResult(HRESULT hrs, LPCTSTR content)
{
	if (SUCCEEDED(hrs))
		return TRUE;

	char code[32];
	sprintf(code,"%ld",static_cast<long>(hrs));

	// Built with std::string so a long content text cannot overflow a
	// fixed-size buffer.
	std::string info = (content != NULL) ? content : "(null)";
	info += " Error = ";
	info += code;
	info += "\r\n";

	SaveCharToFile(info.c_str(),debugpath);
	return FALSE;
}