// vbAccelerator - Contents of code file: IFilter_TextFilter.cs
using System;
using System.Runtime.InteropServices;
using System.Text;
using Microsoft.Win32;
using System.Collections ;
namespace vbAccelerator.Components.TextFilter
{
#region Public IFilter Enumerations
/// <summary>
/// Flags controlling the operation of the TextFilter
/// instance.
/// </summary>
[Flags]
public enum IFILTER_INIT : int
{
    /// <summary>
    /// Paragraph breaks should be marked with the Unicode PARAGRAPH
    /// SEPARATOR (0x2029).
    /// </summary>
    IFILTER_INIT_CANON_PARAGRAPHS = 1,
    /// <summary>
    /// Soft returns, such as the newline character in Microsoft Word, should
    /// be replaced by hard returns (LINE SEPARATOR, 0x2028). Existing hard
    /// returns can be doubled. A carriage return (0x000D), line feed (0x000A),
    /// or the carriage return and line feed in combination should be considered
    /// a hard return. The intent is to enable pattern-expression matches that
    /// match against observed line breaks.
    /// </summary>
    IFILTER_INIT_HARD_LINE_BREAKS = 2,
    /// <summary>
    /// Various word-processing programs have forms of hyphens that are not
    /// represented in the host character set, such as optional hyphens
    /// (appearing only at the end of a line) and nonbreaking hyphens. This flag
    /// indicates that optional hyphens are to be converted to nulls, and
    /// non-breaking hyphens are to be converted to normal hyphens (0x2010), or
    /// HYPHEN-MINUSES (0x002D).
    /// </summary>
    IFILTER_INIT_CANON_HYPHENS = 4,
    /// <summary>
    /// Just as the IFILTER_INIT_CANON_HYPHENS flag standardizes hyphens,
    /// this one standardizes spaces. All special space characters, such as
    /// nonbreaking spaces, are converted to the standard space character
    /// (0x0020).
    /// </summary>
    IFILTER_INIT_CANON_SPACES = 8,
    /// <summary>
    /// Indicates that the client wants text split into chunks representing
    /// internal value-type properties.
    /// </summary>
    IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
    /// <summary>
    /// Indicates that the client wants text split into chunks representing
    /// properties determined during the indexing process.
    /// </summary>
    IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256,
    /// <summary>
    /// Any properties not covered by the IFILTER_INIT_APPLY_INDEX_ATTRIBUTES
    /// and IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES flags should be emitted.
    /// </summary>
    IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
    /// <summary>
    /// Optimizes IFilter for indexing because the client calls the
    /// IFilter::Init method only once and does not call IFilter::BindRegion.
    /// This eliminates the possibility of accessing a chunk both before and
    /// after accessing another chunk.
    /// </summary>
    IFILTER_INIT_INDEXING_ONLY = 64,
    /// <summary>
    /// The text extraction process must recursively search all linked
    /// objects within the document. If a link is unavailable, the
    /// IFilter::GetChunk call that would have obtained the first chunk of the
    /// link should return FILTER_E_LINK_UNAVAILABLE.
    /// </summary>
    IFILTER_INIT_SEARCH_LINKS = 128,
    /// <summary>
    /// The content indexing process can return property values set by the
    /// filter.
    /// </summary>
    IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512
}
/// <summary>
/// Standard property identifiers, as defined in the OLE2 documentation.
/// </summary>
public enum PROPID : int
{
    /// <summary>Property id is not one of the standard values.</summary>
    PID_UNKNOWN      = -1,
    /// <summary>Dictionary: integer count + array of entries.</summary>
    PID_DICTIONARY   = 0x00,
    /// <summary>Document code page (short integer).</summary>
    PID_CODEPAGE     = 0x01,
    /// <summary>Document title (string).</summary>
    PID_TITLE        = 0x02,
    /// <summary>Subject (string).</summary>
    PID_SUBJECT      = 0x03,
    /// <summary>Author (string).</summary>
    PID_AUTHOR       = 0x04,
    /// <summary>Keywords (string).</summary>
    PID_KEYWORDS     = 0x05,
    /// <summary>Comments (string).</summary>
    PID_COMMENTS     = 0x06,
    /// <summary>Template name (string).</summary>
    PID_TEMPLATE     = 0x07,
    /// <summary>Last author (string).</summary>
    PID_LASTAUTHOR   = 0x08,
    /// <summary>Revision number (string).</summary>
    PID_REVNUMBER    = 0x09,
    /// <summary>Edit date time (DateTime).</summary>
    PID_EDITTIME     = 0x0A,
    /// <summary>Last printed (DateTime).</summary>
    PID_LASTPRINTED  = 0x0B,
    /// <summary>Create date time (DateTime).</summary>
    PID_CREATE_DTM   = 0x0C,
    /// <summary>Last save date time (DateTime).</summary>
    PID_LASTSAVE_DTM = 0x0D,
    /// <summary>Page count (integer).</summary>
    PID_PAGECOUNT    = 0x0E,
    /// <summary>Word count (integer).</summary>
    PID_WORDCOUNT    = 0x0F,
    /// <summary>Character count (integer).</summary>
    PID_CHARCOUNT    = 0x10,
    /// <summary>Thumbnail: clipboard format + metafile/bitmap (not supported).</summary>
    PID_THUMBNAIL    = 0x11,
    /// <summary>Application used for creation (string).</summary>
    PID_APPNAME      = 0x12,
    /// <summary>Security (integer).</summary>
    PID_SECURITY     = 0x13
}
/// <summary>
/// Enumerates the different breaking types that occur between
/// chunks of text read out by the TextFilter.
/// </summary>
public enum CHUNK_BREAKTYPE : int
{
    /// <summary>
    /// No break is placed between the current chunk and the previous chunk.
    /// The chunks are glued together.
    /// </summary>
    CHUNK_NO_BREAK = 0,
    /// <summary>
    /// A word break is placed between this chunk and the previous chunk that
    /// had the same attribute.
    /// Use of CHUNK_EOW should be minimized because the choice of word
    /// breaks is language-dependent,
    /// so determining word breaks is best left to the search engine.
    /// </summary>
    CHUNK_EOW = 1,
    /// <summary>
    /// A sentence break is placed between this chunk and the previous chunk
    /// that had the same attribute.
    /// </summary>
    CHUNK_EOS = 2,
    /// <summary>
    /// A paragraph break is placed between this chunk and the previous chunk
    /// that had the same attribute.
    /// </summary>
    CHUNK_EOP = 3,
    /// <summary>
    /// A chapter break is placed between this chunk and the previous chunk
    /// that had the same attribute.
    /// </summary>
    CHUNK_EOC = 4
}
#endregion
#region Internal IFilter Enumerations
/// <summary>
/// Discriminator for the kind of property specification returned by IFilter.
/// </summary>
internal enum PROPSPECKIND : int
{
    /// <summary>The property is identified by a string.</summary>
    PRSPEC_LPWSTR = 0x0,

    /// <summary>The property is identified by a property id.</summary>
    PRSPEC_PROPID = 0x1
}
/// <summary>
/// Kinds of chunk that an IFilter can return.
/// </summary>
internal enum CHUNKSTATE : int
{
    /// <summary>The current chunk holds a text-type property.</summary>
    CHUNK_TEXT = 1,

    /// <summary>The current chunk holds a value-type property.</summary>
    CHUNK_VALUE = 2,

    /// <summary>Reserved.</summary>
    CHUNK_FILTER_OWNED_VALUE = 4
}
#endregion
#region Internal IFilter Types
/// <summary>
/// Managed layout of the chunk descriptor filled in by IFilter::GetChunk.
/// Field order and types must stay in step with the native structure.
/// </summary>
[StructLayoutAttribute(LayoutKind.Sequential)]
internal struct STAT_CHUNK
{
    /// <summary>
    /// The chunk identifier. Chunk identifiers must be unique for the
    /// current instance of the IFilter interface.
    /// Chunk identifiers must be in ascending order. The order in which
    /// chunks are numbered should correspond to the order in which they appear
    /// in the source document. Some search engines can take advantage of the
    /// proximity of chunks of various properties. If so, the order in which
    /// chunks with different properties are emitted will be important to the
    /// search engine.
    /// </summary>
    public int idChunk;
    /// <summary>
    /// The type of break that separates the previous chunk from the current
    /// chunk. Values are from the CHUNK_BREAKTYPE enumeration.
    /// </summary>
    [MarshalAs(UnmanagedType.U4)]
    public CHUNK_BREAKTYPE breakType;
    /// <summary>
    /// Flags indicate whether this chunk contains a text-type or a
    /// value-type property.
    /// Flag values are taken from the CHUNKSTATE enumeration. If the
    /// CHUNK_TEXT flag is set,
    /// IFilter::GetText should be used to retrieve the contents of the chunk
    /// as a series of words.
    /// If the CHUNK_VALUE flag is set, IFilter::GetValue should be used to
    /// retrieve
    /// the value and treat it as a single property value. If the filter
    /// dictates that the same
    /// content be treated as both text and as a value, the chunk should be
    /// emitted twice in two
    /// different chunks, each with one flag set.
    /// </summary>
    [MarshalAs(UnmanagedType.U4)]
    public CHUNKSTATE flags;
    /// <summary>
    /// The language and sublanguage associated with a chunk of text. Chunk
    /// locale is used
    /// by document indexers to perform proper word breaking of text. If the
    /// chunk is
    /// neither text-type nor a value-type with data type VT_LPWSTR, VT_LPSTR
    /// or VT_BSTR,
    /// this field is ignored.
    /// </summary>
    public int locale;
    /// <summary>
    /// The property to be applied to the chunk. If a filter requires that
    /// the same text
    /// have more than one property, it needs to emit the text once for each
    /// property
    /// in separate chunks.
    /// </summary>
    public FULLPROPSPEC attribute;
    /// <summary>
    /// The ID of the source of a chunk. The value of the idChunkSource
    /// member depends on the nature of the chunk:
    /// If the chunk is a text-type property, the value of the idChunkSource
    /// member must be the same as the value of the idChunk member.
    /// If the chunk is an internal value-type property derived from textual
    /// content, the value of the idChunkSource member is the chunk ID for the
    /// text-type chunk from which it is derived.
    /// If the filter attributes specify to return only internal value-type
    /// properties, there is no content chunk from which to derive the current
    /// internal value-type property. In this case, the value of the
    /// idChunkSource member must be set to zero, which is an invalid chunk.
    /// </summary>
    public int idChunkSource;
    /// <summary>
    /// The offset from which the source text for a derived chunk starts in
    /// the source chunk.
    /// </summary>
    public int cwcStartSource;
    /// <summary>
    /// The length in characters of the source text from which the current
    /// chunk was derived.
    /// A zero value signifies character-by-character correspondence between
    /// the source text and
    /// the derived text. A nonzero value means that no such direct
    /// correspondence exists.
    /// </summary>
    public int cwcLenSource;
}
/// <summary>
/// Pairs a property-set GUID with a property specification; used as the
/// attribute member of STAT_CHUNK.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
internal struct FULLPROPSPEC
{
    /// <summary>GUID of the property set the property belongs to.</summary>
    public Guid guidPropSet;

    /// <summary>The property within that property set.</summary>
    public PROPSPEC psProperty;
}
/// <summary>
/// A single property specification: a kind discriminator plus a
/// pointer-sized payload.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
internal struct PROPSPEC
{
    /// <summary>
    /// Either PRSPEC_LPWSTR or PRSPEC_PROPID; tells how <c>data</c> is read.
    /// </summary>
    [MarshalAs(UnmanagedType.U4)]
    public PROPSPECKIND ulKind;

    /// <summary>
    /// Read elsewhere in this file as a property id when ulKind is
    /// PRSPEC_PROPID, and as a pointer to a Unicode string otherwise.
    /// </summary>
    public IntPtr data;
}
#endregion
#region TextFilterException
/// <summary>
/// Exception type used to throw exceptions occurring during Text Filter
/// operations.
/// </summary>
public class TextFilterException : System.Exception
{
    /// <summary>
    /// Constructs a new, blank TextFilterException.
    /// </summary>
    public TextFilterException() : base()
    {
    }

    /// <summary>
    /// Constructs a new TextFilterException with the specified error message.
    /// </summary>
    /// <param name="msg">Error Message</param>
    public TextFilterException(string msg) : base(msg)
    {
    }

    /// <summary>
    /// Constructs a new TextFilterException with the specified error message
    /// containing the specified inner exception.
    /// </summary>
    /// <param name="msg">Error Message</param>
    /// <param name="innerException">Inner Exception</param>
    public TextFilterException(string msg, Exception innerException)
        : base(msg, innerException)
    {
    }
}
#endregion
#region TextFilter
/// <summary>
/// A Managed Code class for invoking an Indexing Service IFilter
/// object on a document to convert it to a text only representation.
/// </summary>
public class TextFilter
{
#region Unmanaged Code
/// <summary>
/// P/Invoke entry point in query.dll that loads the IFilter registered
/// for the given file path. Returns an HRESULT as an int.
/// </summary>
// NOTE(review): the native prototype appears to take pUnkOuter as a plain
// IUnknown* rather than by reference - confirm against the LoadIFilter
// documentation before changing this declaration and its call site together.
[DllImport("query.dll", CharSet = CharSet.Unicode)]
private static extern int LoadIFilter(string pwcsPath, ref IUnknown pUnkOuter, ref IFilter ppIUnk);

/// <summary>
/// P/Invoke entry point for clearing a PROPVARIANT. Not called by the
/// visible code in this file.
/// </summary>
[DllImport("iprop.dll", CharSet = CharSet.Unicode)]
private static extern int PropVariantClear(IntPtr pvar);
#endregion
#region Com Interop for IUnknown
/// <summary>
/// Minimal COM import of IUnknown, used only as a parameter type for
/// LoadIFilter and IFilter.BindRegion. The three methods are declared in
/// this order so the interface shape is unchanged.
/// </summary>
[ComImport]
[Guid("00000000-0000-0000-C000-000000000046")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
private interface IUnknown
{
    [PreserveSig] IntPtr QueryInterface(ref Guid riid, out IntPtr pVoid);

    [PreserveSig] IntPtr AddRef();

    [PreserveSig] IntPtr Release();
}
#endregion
#region Com Interop for IFilter
/// <summary>
/// COM import of the Indexing Service IFilter interface. Method order
/// and signatures define the interop contract and must not be rearranged.
/// </summary>
[ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
private interface IFilter
{
    /// <summary>
    /// The IFilter::Init method initializes a filtering session.
    /// </summary>
    /// <param name="grfFlags">[in] Flag settings from the IFILTER_INIT
    /// enumeration for controlling text standardization, property output,
    /// embedding scope, and IFilter access patterns.</param>
    /// <param name="cAttributes">[in] The size of the attributes array.
    /// When nonzero, cAttributes takes precedence over attributes specified
    /// in grfFlags. If no attribute flags are specified and cAttributes is
    /// zero, the default is given by the PSGUID_STORAGE storage property
    /// set, which contains the date and time of the last write to the file,
    /// size, and so on; and by the PID_STG_CONTENTS 'contents' property,
    /// which maps to the main contents of the file.</param>
    /// <param name="aAttributes">[in] Array of pointers to FULLPROPSPEC
    /// structures for the requested properties. When cAttributes is
    /// nonzero, only the properties in aAttributes are returned.</param>
    /// <param name="pdwFlags">[out] Information about additional properties
    /// available to the caller; from the IFILTER_FLAGS enumeration.</param>
    [PreserveSig]
    IFilterReturnCodes Init(
        [MarshalAs(UnmanagedType.U4)]
        IFILTER_INIT grfFlags,
        int cAttributes,
        IntPtr aAttributes,
        [MarshalAs(UnmanagedType.U4)]
        ref IFILTER_FLAGS pdwFlags);

    /// <summary>
    /// The IFilter::GetChunk method positions the filter at the beginning
    /// of the next chunk, or at the first chunk if this is the first call
    /// to the GetChunk method, and returns a description of the current
    /// chunk.
    /// </summary>
    /// <param name="pStat">Receives the description of the current chunk.</param>
    [PreserveSig]
    IFilterReturnCodes GetChunk(
        ref STAT_CHUNK pStat
        );

    /// <summary>
    /// The IFilter::GetText method retrieves text (text-type properties)
    /// from the current chunk, which must have a CHUNKSTATE enumeration
    /// value of CHUNK_TEXT.
    /// </summary>
    /// <param name="pcwcBuffer">[in/out] On entry, the size of awcBuffer
    /// array in wide/Unicode characters. On exit, the number of Unicode
    /// characters written to awcBuffer. Note that this value is not the
    /// number of bytes in the buffer.</param>
    /// <param name="awcBuffer">Text retrieved from the current chunk. Do
    /// not terminate the buffer with a character.</param>
    [PreserveSig]
    IFilterReturnCodes GetText(
        ref int pcwcBuffer,
        [Out(), MarshalAs(UnmanagedType.LPWStr)]
        StringBuilder awcBuffer
        );

    /// <summary>
    /// The IFilter::GetValue method retrieves a value (internal
    /// value-type property) from a chunk, which must have a CHUNKSTATE
    /// enumeration value of CHUNK_VALUE.
    /// </summary>
    /// <param name="ppPropValue">Allocate the PROPVARIANT structure with
    /// CoTaskMemAlloc. Some PROPVARIANT structures contain pointers, which
    /// can be freed by calling the PropVariantClear function. It is up to
    /// the caller of the GetValue method to call the PropVariantClear
    /// method.</param>
    [PreserveSig]
    IFilterReturnCodes GetValue(
        ref IntPtr ppPropValue
        );

    /// <summary>
    /// The IFilter::BindRegion method retrieves an interface representing
    /// the specified portion of the object.
    /// Currently reserved for future use.
    /// </summary>
    [PreserveSig]
    IFilterReturnCodes BindRegion(
        ref FILTERREGION origPos,
        ref Guid riid,
        ref IUnknown ppunk
        );
}
#endregion
#region Private IFilter Types
/// <summary>
/// Managed stand-in for the native PROPVARIANT: a type tag, three
/// reserved words and a pointer-sized payload. Not referenced by the
/// visible code in this file.
/// </summary>
// NOTE(review): the payload is declared pointer-sized here; verify this
// matches the native union size on every target platform before relying
// on this struct for marshalling.
[StructLayout(LayoutKind.Sequential, Pack = 4, Size = 0, CharSet = CharSet.Auto)]
private struct PROPVARIANT
{
    /// <summary>Variant type tag.</summary>
    public short vt;

    /// <summary>Reserved.</summary>
    public short wReserved1;

    /// <summary>Reserved.</summary>
    public short wReserved2;

    /// <summary>Reserved.</summary>
    public short wReserved3;

    /// <summary>Payload, stored as a pointer-sized value.</summary>
    public IntPtr data;
}
/// <summary>
/// Region descriptor passed to IFilter.BindRegion: a chunk id plus a
/// start position and extent (names suggest character counts).
/// </summary>
[StructLayout(LayoutKind.Sequential)]
private struct FILTERREGION
{
    /// <summary>Identifier of the chunk the region lies in.</summary>
    public int idChunk;

    /// <summary>Start of the region.</summary>
    public int cwcStart;

    /// <summary>Extent of the region.</summary>
    public int cwcExtent;
}
#endregion
#region Private IFilter Enumerations
/// <summary>
/// Variant type indicators (VT_* values) used in the vt member of a
/// PROPVARIANT.
/// </summary>
private enum VariantTypes
{
    /// <summary>
    /// A property with a type indicator of VT_EMPTY has no data
    /// associated with it; that is, the size of the value is zero.
    /// </summary>
    VT_EMPTY = 0,
    /// <summary>
    /// This is like a pointer to NULL.
    /// </summary>
    VT_NULL = 1,
    /// <summary>
    /// cVal 1-byte signed integer.
    /// </summary>
    VT_I1 = 16,
    /// <summary>
    /// bVal 1-byte unsigned integer.
    /// </summary>
    VT_UI1 = 17,
    /// <summary>
    /// Two bytes representing a 2-byte signed integer value.
    /// </summary>
    VT_I2 = 2,
    /// <summary>
    /// 2-byte unsigned integer.
    /// </summary>
    VT_UI2 = 18,
    /// <summary>
    /// 4-byte signed integer value.
    /// </summary>
    VT_I4 = 3,
    /// <summary>
    /// 4-byte signed integer value (equivalent to VT_I4).
    /// </summary>
    VT_INT = 22,
    /// <summary>
    /// 4-byte unsigned integer.
    /// </summary>
    VT_UI4 = 19,
    /// <summary>
    /// 4-byte unsigned integer (equivalent to VT_UI4).
    /// </summary>
    VT_UINT = 23,
    /// <summary>
    /// 8-byte signed integer.
    /// </summary>
    VT_I8 = 20,
    /// <summary>
    /// 8-byte unsigned integer.
    /// </summary>
    VT_UI8 = 21,
    /// <summary>
    /// 32-bit IEEE floating point value.
    /// </summary>
    VT_R4 = 4,
    /// <summary>
    /// 64-bit IEEE floating point value.
    /// </summary>
    VT_R8 = 5,
    /// <summary>
    /// 8-byte two's complement integer (scaled by 10,000). This type is
    /// commonly used for currency amounts.
    /// </summary>
    VT_CY = 6,
    /// <summary>
    /// A 64-bit floating point number representing the number of days
    /// (not seconds) since December 30, 1899. For example, January 1, 1900,
    /// is 2.0, January 2, 1900, is 3.0, and so on. This is stored in the
    /// same representation as VT_R8.
    /// </summary>
    VT_DATE = 7,
    /// <summary>
    /// bstrVal Pointer to a null-terminated Unicode string. The string is
    /// immediately preceded
    /// by a DWORD representing the byte count, but bstrVal points past
    /// this DWORD to
    /// the first character of the string. BSTRs must be allocated and
    /// freed using the
    /// Automation SysAllocString and SysFreeString calls.
    /// </summary>
    VT_BSTR = 8,
    /// <summary>
    /// (bool in earlier designs) Boolean value, a WORD containing 0
    /// (FALSE) or -1 (TRUE).
    /// </summary>
    VT_BOOL = 11,
    /// <summary>
    /// A DWORD containing a status code.
    /// </summary>
    VT_ERROR = 10,
    /// <summary>
    /// filetime 64-bit FILETIME structure as defined by Win32. It is
    /// recommended that all times be stored in Universal Coordinate Time
    /// (UTC).
    /// </summary>
    VT_FILETIME = 64,
    /// <summary>
    /// Pointer to a null-terminated ANSI string in the system default
    /// code page.
    /// </summary>
    VT_LPSTR = 30,
    /// <summary>
    /// Pointer to a null-terminated Unicode string in the user's default
    /// locale.
    /// </summary>
    VT_LPWSTR = 31,
    /// <summary>
    /// Pointer to a class identifier (CLSID) (or other globally unique
    /// identifier (GUID)).
    /// </summary>
    VT_CLSID = 72,
    /// <summary>
    /// Pointer to a clipdata structure
    /// </summary>
    VT_CF = 71,
    /// <summary>
    /// DWORD count of bytes, followed by that many bytes of data. The
    /// byte
    /// count does not include the four bytes for the length of the count
    /// itself;
    /// an empty blob member would have a count of zero, followed by zero
    /// bytes.
    /// This is similar to the value VT_BSTR but does not guarantee a null
    /// byte at the end of the data.
    /// </summary>
    VT_BLOB = 65,
    /// <summary>
    /// A blob member containing a serialized object in the same
    /// representation that would appear in VT_STREAMED_OBJECT.
    /// That is, a DWORD byte count (where the byte count does not include
    /// the size of itself) which is in the
    /// format of a class identifier followed by initialization data for
    /// that class.
    /// The only significant difference between VT_BLOB_OBJECT and
    /// VT_STREAMED_OBJECT is that the former does not
    /// have the system-level storage overhead that the latter would have,
    /// and is therefore more
    /// suitable for scenarios involving numbers of small objects.
    /// </summary>
    VT_BLOBOBJECT = 70,
    /// <summary>
    /// pStream Pointer to an IStream interface, representing a stream
    /// which is a sibling to the "Contents" stream.
    /// </summary>
    VT_STREAM = 66,
    /// <summary>
    /// pStream As in VT_STREAM, but indicates that the stream contains a
    /// serialized object, which is a CLSID followed by initialization data
    /// for the class. The stream is a sibling to the "Contents" stream that
    /// contains the property set.
    /// </summary>
    VT_STREAMED_OBJECT = 68,
    /// <summary>
    /// pStorage Pointer to an IStorage interface, representing a storage
    /// object that is a sibling to the "Contents" stream.
    /// </summary>
    VT_STORAGE = 67,
    /// <summary>
    /// pStorage As in VT_STORAGE, but indicates that the designated
    /// IStorage contains a loadable object.
    /// </summary>
    VT_STORED_OBJECT = 69,
    /// <summary>
    /// decVal A DECIMAL structure.
    /// </summary>
    VT_DECIMAL = 14,
    /// <summary>
    /// ca* If the type indicator is combined with VT_VECTOR by using an
    /// OR operator, the value is
    /// one of the counted array values. This creates a DWORD count of
    /// elements, followed by a
    /// pointer to the specified repetitions of the value.
    /// For example, a type indicator of VT_LPSTR|VT_VECTOR has a DWORD
    /// element count,
    /// followed by a pointer to an array of LPSTR elements.
    /// VT_VECTOR can be combined by an OR operator with the following
    /// types:
    /// VT_I1, VT_UI1, VT_I2, VT_UI2, VT_BOOL, VT_I4, VT_UI4, VT_R4,
    /// VT_R8, VT_ERROR, VT_I8,
    /// VT_UI8, VT_CY, VT_DATE, VT_FILETIME, VT_CLSID, VT_CF, VT_BSTR,
    /// VT_LPSTR, VT_LPWSTR, and VT_VARIANT.
    /// </summary>
    VT_VECTOR = 0x1000,
    /// <summary>
    /// If the type indicator is combined with VT_ARRAY by an OR operator,
    /// the value is
    /// a pointer to a SAFEARRAY. VT_ARRAY can use the OR with the
    /// following data types:
    /// VT_I1, VT_UI1, VT_I2, VT_UI2, VT_I4, VT_UI4, VT_INT, VT_UINT,
    /// VT_R4, VT_R8, VT_BOOL,
    /// VT_DECIMAL, VT_ERROR, VT_CY, VT_DATE, and VT_BSTR. VT_ARRAY cannot
    /// use OR with VT_VECTOR.
    /// </summary>
    VT_ARRAY = 0x2000,
    /// <summary>
    /// If the type indicator is combined with VT_BYREF by an OR operator,
    /// the value is a reference.
    /// Reference types are interpreted as a reference to data, similar to
    /// the
    /// reference type in C++ (for example, "int&amp;").
    /// VT_BYREF can use OR with the following types: VT_I1, VT_UI1,
    /// VT_I2, VT_UI2, VT_I4, VT_UI4,
    /// VT_INT, VT_UINT, VT_R4, VT_R8, VT_BOOL, VT_DECIMAL, VT_ERROR,
    /// VT_CY, VT_DATE,
    /// VT_BSTR, VT_ARRAY, and VT_VARIANT.
    /// </summary>
    VT_BYREF = 0x4000,
    /// <summary>
    /// A DWORD type indicator followed by the corresponding value.
    /// VT_VARIANT can be used
    /// only with VT_VECTOR or VT_BYREF.
    /// </summary>
    VT_VARIANT = 12,
    /// <summary>
    /// Used as a mask for VT_VECTOR and other modifiers to extract the
    /// raw VT value.
    /// </summary>
    VT_TYPEMASK = 0xFFF
}
/// <summary>
/// Flags returned by IFilter::Init describing additional properties
/// available to the caller.
/// </summary>
[Flags]
private enum IFILTER_FLAGS : int
{
    /// <summary>
    /// The caller should use the IPropertySetStorage and IPropertyStorage
    /// interfaces to locate additional properties.
    /// When this flag is set, properties available through COM
    /// enumerators should not be returned from IFilter.
    /// </summary>
    IFILTER_FLAGS_OLE_PROPERTIES = 1
}
/// <summary>
/// HRESULT values returned by the IFilter methods and by LoadIFilter.
/// </summary>
private enum IFilterReturnCodes : uint
{
    /// <summary>
    /// Success
    /// </summary>
    S_OK = 0,
    /// <summary>
    /// The function was denied access to the filter file.
    /// </summary>
    E_ACCESSDENIED = 0x80070005,
    /// <summary>
    /// The function encountered an invalid handle, probably due to a
    /// low-memory situation.
    /// </summary>
    E_HANDLE = 0x80070006,
    /// <summary>
    /// The function received an invalid parameter.
    /// </summary>
    E_INVALIDARG = 0x80070057,
    /// <summary>
    /// Out of memory
    /// </summary>
    E_OUTOFMEMORY = 0x8007000E,
    /// <summary>
    /// Not implemented
    /// </summary>
    E_NOTIMPL = 0x80004001,
    /// <summary>
    /// Unspecified failure. The Win32 value is 0x80004005; 0x80000008 was
    /// the 16-bit OLE value and is never returned by 32/64-bit COM.
    /// </summary>
    E_FAIL = 0x80004005,
    /// <summary>
    /// File not filtered due to password protection
    /// </summary>
    FILTER_E_PASSWORD = 0x8004170B,
    /// <summary>
    /// The document format is not recognised by the filter
    /// </summary>
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,
    /// <summary>
    /// No text in current chunk
    /// </summary>
    FILTER_E_NO_TEXT = 0x80041705,
    /// <summary>
    /// No values in current chunk
    /// </summary>
    FILTER_E_NO_VALUES = 0x80041706,
    /// <summary>
    /// No more chunks of text available in object
    /// </summary>
    FILTER_E_END_OF_CHUNKS = 0x80041700,
    /// <summary>
    /// No more text available in chunk
    /// </summary>
    FILTER_E_NO_MORE_TEXT = 0x80041701,
    /// <summary>
    /// No more property values available in chunk
    /// </summary>
    FILTER_E_NO_MORE_VALUES = 0x80041702,
    /// <summary>
    /// Unable to access object
    /// </summary>
    FILTER_E_ACCESS = 0x80041703,
    /// <summary>
    /// Moniker doesn't cover entire region
    /// </summary>
    FILTER_W_MONIKER_CLIPPED = 0x00041704,
    /// <summary>
    /// Unable to bind IFilter for embedded object
    /// </summary>
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
    /// <summary>
    /// Unable to bind IFilter for linked object
    /// </summary>
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,
    /// <summary>
    /// This is the last text in the current chunk
    /// </summary>
    FILTER_S_LAST_TEXT = 0x00041709,
    /// <summary>
    /// This is the last value in the current chunk
    /// </summary>
    FILTER_S_LAST_VALUES = 0x0004170A
}
#endregion
#region Member Variables
// Backing field for the TextItems property.
private TextFilterItems m_textItems;
// Backing field for the PropertyItems property.
private TextFilterPropertyItems m_propertyItems;
// Backing field for the FileName property.
private string m_fileName = "";
#endregion
#region Implementation
/// <summary>
/// The collection of text chunks produced by this TextFilter.
/// </summary>
public TextFilterItems TextItems
{
    get { return this.m_textItems; }
}
/// <summary>
/// The collection of property items produced by this TextFilter.
/// </summary>
public TextFilterPropertyItems PropertyItems
{
    get { return this.m_propertyItems; }
}
/// <summary>
/// The name of the file that was filtered.
/// </summary>
public string FileName
{
    get { return this.m_fileName; }
}
#endregion
#region Constructors, Dispose
/// <summary>
/// Constructs a new instance of a TextFilter and
/// parses the specified file using the default flags
/// (search links and apply all attributes)
/// </summary>
/// <param name="file">File to filter</param>
/// <exception cref="TextFilterException">If the filter cannot be created
/// or initialised for this instance</exception>
public TextFilter(string file) :
    this(file,
        IFILTER_INIT.IFILTER_INIT_SEARCH_LINKS |
        IFILTER_INIT.IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES |
        IFILTER_INIT.IFILTER_INIT_APPLY_INDEX_ATTRIBUTES |
        IFILTER_INIT.IFILTER_INIT_APPLY_OTHER_ATTRIBUTES)
{
}
/// <summary>
/// Constructs a new instance of TextFilter and parses
/// the specified file, using the selected filter flags.
/// All text chunks are read eagerly; the COM filter is released before
/// the constructor returns, whether it succeeds or throws.
/// </summary>
/// <param name="file">File to filter</param>
/// <param name="flags">IFilter initialisation flags</param>
/// <exception cref="TextFilterException">If the filter cannot be created
/// or initialised for this instance</exception>
public TextFilter(string file, IFILTER_INIT flags)
{
    IFilter iflt = null;
    IUnknown iunk = null;
    int i = LoadIFilter(file, ref iunk, ref iflt);
    if (i != (int)IFilterReturnCodes.S_OK || iflt == null)
    {
        throw new TextFilterException(
            String.Format("IFilter instance not found for file {0}", file));
    }

    ArrayList textItems = new ArrayList();
    try
    {
        IFILTER_FLAGS flagsSet = 0;
        IFilterReturnCodes scode = iflt.Init(flags, 0, IntPtr.Zero, ref flagsSet);
        if (scode != IFilterReturnCodes.S_OK)
        {
            throw new TextFilterException(
                String.Format("IFilter initialisation failed: {0}",
                Enum.GetName(scode.GetType(), scode)));
        }

        const int bufSize = 65536;
        StringBuilder buf = new StringBuilder(bufSize, bufSize);
        STAT_CHUNK stat = new STAT_CHUNK();
        bool moreChunks = true;
        while (moreChunks)
        {
            scode = iflt.GetChunk(ref stat);
            if (scode == IFilterReturnCodes.S_OK)
            {
                // flags is a bit field, so test the bit rather than equality.
                if ((stat.flags & CHUNKSTATE.CHUNK_TEXT) != 0)
                {
                    string text = ReadChunkText(iflt, buf, bufSize);
                    if (text.Length > 0)
                    {
                        textItems.Add(new TextFilterItem(stat, text));
                    }
                }
                // Value-type chunks (CHUNK_VALUE) are not extracted, so
                // PropertyItems stays empty.
            }
            else if (scode != IFilterReturnCodes.FILTER_E_EMBEDDING_UNAVAILABLE
                && scode != IFilterReturnCodes.FILTER_E_LINK_UNAVAILABLE)
            {
                // End of chunks or a real failure: stop. The two codes
                // excluded above only mean that one embedded/linked object
                // could not be filtered, so the next chunk is still tried.
                moreChunks = false;
            }
        }
    }
    finally
    {
        // Release the COM filter deterministically instead of waiting for
        // the runtime callable wrapper to be finalised.
        Marshal.ReleaseComObject(iflt);
    }

    this.m_fileName = file;
    this.m_propertyItems = new TextFilterPropertyItems();
    this.m_textItems = new TextFilterItems(textItems);
}

/// <summary>
/// Reads all of the text of the filter's current chunk by calling GetText
/// repeatedly until the filter reports that no more text is available.
/// </summary>
/// <param name="filter">Filter positioned on a text chunk</param>
/// <param name="buffer">Reusable marshalling buffer</param>
/// <param name="bufferSize">Capacity of <paramref name="buffer"/> in characters</param>
/// <returns>The chunk text, or an empty string if none was returned</returns>
private static string ReadChunkText(IFilter filter, StringBuilder buffer, int bufferSize)
{
    StringBuilder text = new StringBuilder();
    IFilterReturnCodes rc = IFilterReturnCodes.S_OK;
    while (rc == IFilterReturnCodes.S_OK)
    {
        int count = bufferSize;
        rc = filter.GetText(ref count, buffer);
        if (rc != IFilterReturnCodes.S_OK && rc != IFilterReturnCodes.FILTER_S_LAST_TEXT)
        {
            // On failure the count is not guaranteed to have been updated,
            // so the buffer contents must not be used.
            break;
        }
        // Never read past what the marshaller actually copied back.
        int usable = Math.Min(count, buffer.Length);
        if (usable <= 0)
        {
            // Guards against a filter that keeps returning S_OK with no text.
            break;
        }
        text.Append(buffer.ToString(0, usable));
    }
    return text.ToString();
}
#endregion
}
#endregion
#region TextFilterItems
/// <summary>
/// Read-only collection of the TextFilterItem chunks produced by
/// the TextFilter class.
/// </summary>
public class TextFilterItems : ReadOnlyCollectionBase
{
    /// <summary>
    /// Internal constructor used by the TextFilter class; copies every
    /// entry of the supplied list into this collection.
    /// </summary>
    /// <param name="items">ArrayList containing the
    /// TextFilterItems to add</param>
    internal TextFilterItems(ArrayList items)
    {
        for (int n = 0; n < items.Count; n++)
        {
            // The cast keeps the collection homogeneous: anything that is
            // not a TextFilterItem is rejected here.
            TextFilterItem entry = (TextFilterItem)items[n];
            InnerList.Add(entry);
        }
    }

    /// <summary>
    /// Gets the Text Filter Item at the given position.
    /// </summary>
    public TextFilterItem this[int index]
    {
        get { return (TextFilterItem)InnerList[index]; }
    }
}
#endregion
#region TextFilterItem
/// <summary>
/// Maintains a chunk of text extracted by the Text Filter.
/// </summary>
public class TextFilterItem
{
    #region Member Variables
    private STAT_CHUNK m_stat;
    private string m_text;
    #endregion

    /// <summary>
    /// The chunk identifier for this piece of text from the
    /// TextFilter
    /// </summary>
    public int Id
    {
        get
        {
            return m_stat.idChunk;
        }
    }

    /// <summary>
    /// Gets the type of break which separates this piece of
    /// text from the previous piece.
    /// </summary>
    public CHUNK_BREAKTYPE BreakType
    {
        get
        {
            return m_stat.breakType;
        }
    }

    /// <summary>
    /// The offset from which the source text for a derived chunk starts in
    /// the source chunk
    /// </summary>
    public int SourceStartOffset
    {
        get
        {
            return m_stat.cwcStartSource;
        }
    }

    /// <summary>
    /// The length, in characters, of the source region from which this text
    /// chunk was
    /// derived, or 0 if no correspondence exists between this and the source.
    /// </summary>
    public int SourceLength
    {
        get
        {
            return m_stat.cwcLenSource;
        }
    }

    /// <summary>
    /// The locale identifier for this chunk of text. Same as the LCID
    /// in the System.Globalization.CultureInfo object.
    /// </summary>
    public int LCID
    {
        get
        {
            return m_stat.locale;
        }
    }

    /// <summary>
    /// Returns the Guid for this chunk
    /// </summary>
    public Guid ItemGuid
    {
        get
        {
            return m_stat.attribute.guidPropSet;
        }
    }

    /// <summary>
    /// Returns the property id of this text if it corresponds to one
    /// of the standard properties, or PROPID.PID_UNKNOWN otherwise
    /// </summary>
    public PROPID PropertyId
    {
        get
        {
            if (m_stat.attribute.psProperty.ulKind == PROPSPECKIND.PRSPEC_PROPID)
            {
                return (PROPID)RawPropertyId();
            }
            return PROPID.PID_UNKNOWN;
        }
    }

    /// <summary>
    /// Returns the name of the property id of this text: the PROPID member
    /// name when the property is identified by id, or the property's string
    /// name otherwise. An empty string is returned when no name is available.
    /// </summary>
    public string PropertyName
    {
        get
        {
            string ret;
            if (m_stat.attribute.psProperty.ulKind == PROPSPECKIND.PRSPEC_PROPID)
            {
                // Look the id up in PROPID; the kind enumeration only has
                // two members and would yield unrelated names.
                ret = Enum.GetName(typeof(PROPID), RawPropertyId());
            }
            else
            {
                // NOTE(review): assumes the string pointer captured in the
                // STAT_CHUNK is still valid when this getter runs - confirm.
                ret = Marshal.PtrToStringUni(m_stat.attribute.psProperty.data);
            }
            if (ret == null)
            {
                ret = "";
            }
            return ret;
        }
    }

    /// <summary>
    /// Returns the text for this chunk
    /// </summary>
    public string Text
    {
        get
        {
            return m_text;
        }
    }

    /// <summary>
    /// Internal constructor for the a Text Filter Item,
    /// used by the TextFilter class
    /// </summary>
    /// <param name="stat">STAT_CHUNK for this piece of text</param>
    /// <param name="text">Text</param>
    internal TextFilterItem(
        STAT_CHUNK stat,
        string text
        )
    {
        m_stat = stat;
        m_text = text;
    }

    /// <summary>
    /// Extracts the 32-bit property id from the pointer-sized payload.
    /// Only the low 32 bits are kept, so a 64-bit payload with anything in
    /// its upper half cannot raise an OverflowException.
    /// </summary>
    private int RawPropertyId()
    {
        return unchecked((int)m_stat.attribute.psProperty.data.ToInt64());
    }
}
#endregion
#region TextFilterPropertyItems
/// <summary>
/// Read-only collection of the property items extracted by the
/// TextFilter.
/// </summary>
public class TextFilterPropertyItems : ReadOnlyCollectionBase
{
    /// <summary>
    /// Internal constructor, used by the TextFilter class; creates an
    /// empty collection.
    /// </summary>
    internal TextFilterPropertyItems()
    {
    }

    /// <summary>
    /// Gets the text filter property item at the given position.
    /// </summary>
    public TextFilterPropertyItem this[int index]
    {
        get { return (TextFilterPropertyItem)InnerList[index]; }
    }
}
#endregion
#region TextFilterPropertyItem
/// <summary>
/// Represents a single property item extracted by the TextFilter.
/// The type currently carries no data.
/// </summary>
public class TextFilterPropertyItem
{
    /// <summary>
    /// Internal constructor, used by the TextFilter class.
    /// </summary>
    internal TextFilterPropertyItem() { }
}
#endregion
}
// End of code file: IFilter_TextFilter.cs